YARN-9280. Backport YARN-6620 to YARN-8200/branch-2 for NodeManager-side GPU isolation
This commit is contained in:
parent
631dfc7277
commit
7ec4d7c6ce
|
@ -18,10 +18,13 @@
|
|||
|
||||
package org.apache.hadoop.yarn.api.records;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
|
||||
import org.apache.hadoop.yarn.util.UnitsConversionUtil;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Class to encapsulate information about a Resource - the name of the resource,
|
||||
* the units(milli, micro, etc), the type(countable), and the value.
|
||||
|
@ -35,13 +38,20 @@ public class ResourceInformation implements Comparable<ResourceInformation> {
|
|||
private long minimumAllocation;
|
||||
private long maximumAllocation;
|
||||
|
||||
// Known resource types
|
||||
public static final String MEMORY_URI = "memory-mb";
|
||||
public static final String VCORES_URI = "vcores";
|
||||
public static final String GPU_URI = "yarn.io/gpu";
|
||||
|
||||
public static final ResourceInformation MEMORY_MB =
|
||||
ResourceInformation.newInstance(MEMORY_URI, "Mi");
|
||||
public static final ResourceInformation VCORES =
|
||||
ResourceInformation.newInstance(VCORES_URI);
|
||||
public static final ResourceInformation GPUS =
|
||||
ResourceInformation.newInstance(GPU_URI);
|
||||
|
||||
public static final Map<String, ResourceInformation> MANDATORY_RESOURCES =
|
||||
ImmutableMap.of(MEMORY_URI, MEMORY_MB, VCORES_URI, VCORES, GPU_URI, GPUS);
|
||||
|
||||
/**
|
||||
* Get the name for the resource.
|
||||
|
|
|
@ -1411,6 +1411,39 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT =
|
||||
NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit";
|
||||
|
||||
/**
|
||||
* Prefix for computation resources. Examples of computation resources are
|
||||
* GPU / FPGA / TPU, etc.
|
||||
*/
|
||||
@Private
|
||||
public static final String NM_RESOURCE_PLUGINS =
|
||||
NM_PREFIX + "resource-plugins";
|
||||
|
||||
/**
|
||||
* Prefix for gpu configurations. Work in progress: This configuration
|
||||
* parameter may be changed/removed in the future.
|
||||
*/
|
||||
@Private
|
||||
public static final String NM_GPU_RESOURCE_PREFIX =
|
||||
NM_RESOURCE_PLUGINS + ".gpu.";
|
||||
|
||||
@Private
|
||||
public static final String NM_GPU_ALLOWED_DEVICES =
|
||||
NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices";
|
||||
@Private
|
||||
public static final String AUTOMATICALLY_DISCOVER_GPU_DEVICES = "auto";
|
||||
|
||||
/**
|
||||
* This setting controls where to find and how to invoke GPU binaries.
|
||||
*/
|
||||
@Private
|
||||
public static final String NM_GPU_PATH_TO_EXEC =
|
||||
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
|
||||
|
||||
@Private
|
||||
public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = "";
|
||||
|
||||
|
||||
/** NM Webapp address.**/
|
||||
public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address";
|
||||
public static final int DEFAULT_NM_WEBAPP_PORT = 8042;
|
||||
|
|
|
@ -46,6 +46,8 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
|
||||
|
||||
/**
|
||||
* Helper class to read the resource-types to be supported by the system.
|
||||
*/
|
||||
|
@ -82,33 +84,32 @@ public class ResourceUtils {
|
|||
*/
|
||||
String key = "memory";
|
||||
if (resourceInformationMap.containsKey(key)) {
|
||||
LOG.warn("Attempt to define resource '" + key +
|
||||
"', but it is not allowed.");
|
||||
throw new YarnRuntimeException("Attempt to re-define mandatory resource '"
|
||||
+ key + "'.");
|
||||
LOG.warn(
|
||||
"Attempt to define resource '" + key + "', but it is not allowed.");
|
||||
throw new YarnRuntimeException(
|
||||
"Attempt to re-define mandatory resource '" + key + "'.");
|
||||
}
|
||||
|
||||
if (resourceInformationMap.containsKey(MEMORY)) {
|
||||
ResourceInformation memInfo = resourceInformationMap.get(MEMORY);
|
||||
String memUnits = ResourceInformation.MEMORY_MB.getUnits();
|
||||
ResourceTypes memType = ResourceInformation.MEMORY_MB.getResourceType();
|
||||
if (!memInfo.getUnits().equals(memUnits) || !memInfo.getResourceType()
|
||||
.equals(memType)) {
|
||||
throw new YarnRuntimeException(
|
||||
"Attempt to re-define mandatory resource 'memory-mb'. It can only"
|
||||
+ " be of type 'COUNTABLE' and have units 'Mi'.");
|
||||
}
|
||||
}
|
||||
for (Map.Entry<String, ResourceInformation> mandatoryResourceEntry :
|
||||
ResourceInformation.MANDATORY_RESOURCES.entrySet()) {
|
||||
key = mandatoryResourceEntry.getKey();
|
||||
ResourceInformation mandatoryRI = mandatoryResourceEntry.getValue();
|
||||
|
||||
if (resourceInformationMap.containsKey(VCORES)) {
|
||||
ResourceInformation vcoreInfo = resourceInformationMap.get(VCORES);
|
||||
String vcoreUnits = ResourceInformation.VCORES.getUnits();
|
||||
ResourceTypes vcoreType = ResourceInformation.VCORES.getResourceType();
|
||||
if (!vcoreInfo.getUnits().equals(vcoreUnits) || !vcoreInfo
|
||||
.getResourceType().equals(vcoreType)) {
|
||||
throw new YarnRuntimeException(
|
||||
"Attempt to re-define mandatory resource 'vcores'. It can only be"
|
||||
+ " of type 'COUNTABLE' and have units ''(no units).");
|
||||
ResourceInformation newDefinedRI = resourceInformationMap.get(key);
|
||||
if (newDefinedRI != null) {
|
||||
String expectedUnit = mandatoryRI.getUnits();
|
||||
ResourceTypes expectedType = mandatoryRI.getResourceType();
|
||||
String actualUnit = newDefinedRI.getUnits();
|
||||
ResourceTypes actualType = newDefinedRI.getResourceType();
|
||||
|
||||
if (!expectedUnit.equals(actualUnit) || !expectedType.equals(
|
||||
actualType)) {
|
||||
throw new YarnRuntimeException("Defined mandatory resource type="
|
||||
+ key + " inside resource-types.xml, however its type or "
|
||||
+ "unit is conflict to mandatory resource types, expected type="
|
||||
+ expectedType + ", unit=" + expectedUnit + "; actual type="
|
||||
+ actualType + " actual unit=" + actualUnit);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3455,6 +3455,45 @@
|
|||
<value>/confstore</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
When yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices=auto is specified,
|
||||
YARN NodeManager needs to run a GPU discovery binary (currently only
|
||||
nvidia-smi is supported) to get GPU-related information.
|
||||
When value is empty (default), YARN NodeManager will try to locate
|
||||
discovery executable itself.
|
||||
An example of the config value is: /usr/local/bin/nvidia-smi
|
||||
</description>
|
||||
<name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name>
|
||||
<value></value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Enable additional discovery/isolation of resources on the NodeManager,
|
||||
split by comma. By default, this is empty. Acceptable values: { "yarn.io/gpu" }.
|
||||
</description>
|
||||
<name>yarn.nodemanager.resource-plugins</name>
|
||||
<value></value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>
|
||||
Specify GPU devices which can be managed by YARN NodeManager, split by comma.
|
||||
Number of GPU devices will be reported to RM to make scheduling decisions.
|
||||
Set to auto (default) to let YARN automatically discover GPU resources from
|
||||
system.
|
||||
Manually specify GPU devices if automatic detection fails or the admin
|
||||
only wants a subset of GPU devices managed by YARN. GPU devices are identified
|
||||
by their minor device number. A common approach to get minor device number
|
||||
of GPUs is to run "nvidia-smi -q" and search the output for "Minor Number". An
|
||||
example of manual specification is "0,1,2,4" to allow YARN NodeManager
|
||||
to manage GPU devices with minor number 0/1/2/4.
|
||||
</description>
|
||||
<name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name>
|
||||
<value>auto</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>The http address of the timeline reader web application.</description>
|
||||
<name>yarn.timeline-service.reader.webapp.address</name>
|
||||
|
|
|
@ -52,6 +52,23 @@ public class TestResourceUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static void addNewTypesToResources(String... resourceTypes) {
|
||||
// Initialize resource map
|
||||
Map<String, ResourceInformation> riMap = new HashMap<>();
|
||||
|
||||
// Initialize mandatory resources
|
||||
riMap.put(ResourceInformation.MEMORY_URI, ResourceInformation.MEMORY_MB);
|
||||
riMap.put(ResourceInformation.VCORES_URI, ResourceInformation.VCORES);
|
||||
|
||||
for (String newResource : resourceTypes) {
|
||||
riMap.put(newResource, ResourceInformation
|
||||
.newInstance(newResource, "", 0, ResourceTypes.COUNTABLE, 0,
|
||||
Integer.MAX_VALUE));
|
||||
}
|
||||
|
||||
ResourceUtils.initializeResourcesFromResourceInformationMap(riMap);
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
ResourceUtils.resetResourceTypes();
|
||||
|
|
|
@ -112,9 +112,10 @@ public abstract class ContainerExecutor implements Configurable {
|
|||
* Run the executor initialization steps.
|
||||
* Verify that the necessary configs and permissions are in place.
|
||||
*
|
||||
* @param nmContext Context of NM
|
||||
* @throws IOException if initialization fails
|
||||
*/
|
||||
public abstract void init() throws IOException;
|
||||
public abstract void init(Context nmContext) throws IOException;
|
||||
|
||||
/**
|
||||
* This function localizes the JAR file on-demand.
|
||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
|
||||
|
@ -122,4 +123,6 @@ public interface Context {
|
|||
void setNMTimelinePublisher(NMTimelinePublisher nmMetricsPublisher);
|
||||
|
||||
NMTimelinePublisher getNMTimelinePublisher();
|
||||
|
||||
ResourcePluginManager getResourcePluginManager();
|
||||
}
|
||||
|
|
|
@ -134,7 +134,7 @@ public class DefaultContainerExecutor extends ContainerExecutor {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void init() throws IOException {
|
||||
public void init(Context nmContext) throws IOException {
|
||||
// nothing to do or verify here
|
||||
}
|
||||
|
||||
|
|
|
@ -117,7 +117,7 @@ public class DockerContainerExecutor extends ContainerExecutor {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void init() throws IOException {
|
||||
public void init(Context nmContext) throws IOException {
|
||||
String auth =
|
||||
getConf().get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION);
|
||||
if (auth != null && !auth.equals("simple")) {
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager;
|
|||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Optional;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -281,7 +282,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void init() throws IOException {
|
||||
public void init(Context nmContext) throws IOException {
|
||||
Configuration conf = super.getConf();
|
||||
|
||||
// Send command to executor which will just start up,
|
||||
|
@ -305,7 +306,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
|
||||
try {
|
||||
resourceHandlerChain = ResourceHandlerModule
|
||||
.getConfiguredResourceHandlerChain(conf);
|
||||
.getConfiguredResourceHandlerChain(conf, nmContext);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Resource handler chain enabled = " + (resourceHandlerChain
|
||||
!= null));
|
||||
|
@ -845,4 +846,9 @@ public class LinuxContainerExecutor extends ContainerExecutor {
|
|||
e);
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public ResourceHandler getResourceHandler() {
|
||||
return resourceHandlerChain;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,23 +18,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ConcurrentSkipListMap;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
|
||||
import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
@ -66,12 +50,16 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
|||
import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
|
||||
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
|
||||
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
|
||||
|
@ -79,14 +67,25 @@ import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ScriptBasedNodeLabel
|
|||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer;
|
||||
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
|
||||
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||
import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ConcurrentSkipListMap;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
public class NodeManager extends CompositeService
|
||||
implements EventHandler<NodeManagerEvent> {
|
||||
|
@ -333,6 +332,18 @@ public class NodeManager extends CompositeService
|
|||
nmCheckintervalTime, scriptTimeout, scriptArgs);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected ResourcePluginManager createResourcePluginManager() {
|
||||
return new ResourcePluginManager();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected ContainerExecutor createContainerExecutor(Configuration conf) {
|
||||
return ReflectionUtils.newInstance(
|
||||
conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
|
||||
DefaultContainerExecutor.class, ContainerExecutor.class), conf);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serviceInit(Configuration conf) throws Exception {
|
||||
UserGroupInformation.setConfiguration(conf);
|
||||
|
@ -362,11 +373,20 @@ public class NodeManager extends CompositeService
|
|||
|
||||
this.aclsManager = new ApplicationACLsManager(conf);
|
||||
|
||||
ContainerExecutor exec = ReflectionUtils.newInstance(
|
||||
conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
|
||||
DefaultContainerExecutor.class, ContainerExecutor.class), conf);
|
||||
boolean isDistSchedulingEnabled =
|
||||
conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
|
||||
YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
|
||||
|
||||
this.context = createNMContext(containerTokenSecretManager,
|
||||
nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
|
||||
|
||||
ResourcePluginManager pluginManager = createResourcePluginManager();
|
||||
pluginManager.initialize(context);
|
||||
((NMContext)context).setResourcePluginManager(pluginManager);
|
||||
|
||||
ContainerExecutor exec = createContainerExecutor(conf);
|
||||
try {
|
||||
exec.init();
|
||||
exec.init(context);
|
||||
} catch (IOException e) {
|
||||
throw new YarnRuntimeException("Failed to initialize container executor", e);
|
||||
}
|
||||
|
@ -382,13 +402,6 @@ public class NodeManager extends CompositeService
|
|||
getNodeHealthScriptRunner(conf), dirsHandler);
|
||||
addService(nodeHealthChecker);
|
||||
|
||||
boolean isDistSchedulingEnabled =
|
||||
conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
|
||||
YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
|
||||
|
||||
this.context = createNMContext(containerTokenSecretManager,
|
||||
nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
|
||||
|
||||
|
||||
((NMContext)context).setContainerExecutor(exec);
|
||||
|
||||
|
@ -462,6 +475,12 @@ public class NodeManager extends CompositeService
|
|||
try {
|
||||
super.serviceStop();
|
||||
DefaultMetricsSystem.shutdown();
|
||||
|
||||
// Cleanup ResourcePluginManager
|
||||
ResourcePluginManager rpm = context.getResourcePluginManager();
|
||||
if (rpm != null) {
|
||||
rpm.cleanup();
|
||||
}
|
||||
} finally {
|
||||
// YARN-3641: NM's services stop get failed shouldn't block the
|
||||
// release of NMLevelDBStore.
|
||||
|
@ -609,6 +628,8 @@ public class NodeManager extends CompositeService
|
|||
|
||||
private NMTimelinePublisher nmTimelinePublisher;
|
||||
|
||||
private ResourcePluginManager resourcePluginManager;
|
||||
|
||||
public NMContext(NMContainerTokenSecretManager containerTokenSecretManager,
|
||||
NMTokenSecretManagerInNM nmTokenSecretManager,
|
||||
LocalDirsHandlerService dirsHandler, ApplicationACLsManager aclsManager,
|
||||
|
@ -809,6 +830,15 @@ public class NodeManager extends CompositeService
|
|||
public NMTimelinePublisher getNMTimelinePublisher() {
|
||||
return nmTimelinePublisher;
|
||||
}
|
||||
|
||||
public ResourcePluginManager getResourcePluginManager() {
|
||||
return resourcePluginManager;
|
||||
}
|
||||
|
||||
public void setResourcePluginManager(
|
||||
ResourcePluginManager resourcePluginManager) {
|
||||
this.resourcePluginManager = resourcePluginManager;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -33,6 +33,9 @@ import java.util.Map.Entry;
|
|||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -178,14 +181,15 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
long memoryMb = totalResource.getMemorySize();
|
||||
float vMemToPMem =
|
||||
conf.getFloat(
|
||||
YarnConfiguration.NM_VMEM_PMEM_RATIO,
|
||||
YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
|
||||
YarnConfiguration.NM_VMEM_PMEM_RATIO,
|
||||
YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
|
||||
long virtualMemoryMb = (long)Math.ceil(memoryMb * vMemToPMem);
|
||||
|
||||
int virtualCores = totalResource.getVirtualCores();
|
||||
LOG.info("Nodemanager resources: memory set to " + memoryMb + "MB.");
|
||||
LOG.info("Nodemanager resources: vcores set to " + virtualCores + ".");
|
||||
LOG.info("Nodemanager resources: " + totalResource);
|
||||
|
||||
// Update configured resources via plugins.
|
||||
updateConfiguredResourcesViaPlugins(totalResource);
|
||||
|
||||
LOG.info("Nodemanager resources is set to: " + totalResource);
|
||||
|
||||
metrics.addResource(totalResource);
|
||||
|
||||
|
@ -342,12 +346,27 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
return ServerRMProxy.createRMProxy(conf, ResourceTracker.class);
|
||||
}
|
||||
|
||||
private void updateConfiguredResourcesViaPlugins(
|
||||
Resource configuredResource) throws YarnException {
|
||||
ResourcePluginManager pluginManager = context.getResourcePluginManager();
|
||||
if (pluginManager != null && pluginManager.getNameToPlugins() != null) {
|
||||
// Update configured resource
|
||||
for (ResourcePlugin resourcePlugin : pluginManager.getNameToPlugins()
|
||||
.values()) {
|
||||
if (resourcePlugin.getNodeResourceHandlerInstance() != null) {
|
||||
resourcePlugin.getNodeResourceHandlerInstance()
|
||||
.updateConfiguredResource(configuredResource);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected void registerWithRM()
|
||||
throws YarnException, IOException {
|
||||
RegisterNodeManagerResponse regNMResponse;
|
||||
Set<NodeLabel> nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration();
|
||||
|
||||
|
||||
// Synchronize NM-RM registration with
|
||||
// ContainerManagerImpl#increaseContainersResource and
|
||||
// ContainerManagerImpl#startContainers to avoid race condition
|
||||
|
@ -358,6 +377,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource,
|
||||
nodeManagerVersionId, containerReports, getRunningApplications(),
|
||||
nodeLabels, physicalResource);
|
||||
|
||||
if (containerReports != null) {
|
||||
LOG.info("Registering with RM using containers :" + containerReports);
|
||||
}
|
||||
|
@ -406,7 +426,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
if (masterKey != null) {
|
||||
this.context.getContainerTokenSecretManager().setMasterKey(masterKey);
|
||||
}
|
||||
|
||||
|
||||
masterKey = regNMResponse.getNMTokenMasterKey();
|
||||
if (masterKey != null) {
|
||||
this.context.getNMTokenSecretManager().setMasterKey(masterKey);
|
||||
|
@ -733,7 +753,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public long getRMIdentifier() {
|
||||
return this.rmIdentifier;
|
||||
|
|
|
@ -51,6 +51,7 @@ public class PrivilegedOperation {
|
|||
TC_READ_STATS("--tc-read-stats"),
|
||||
ADD_PID_TO_CGROUP(""), //no CLI switch supported yet.
|
||||
RUN_DOCKER_CMD("--run-docker"),
|
||||
GPU("--module-gpu"),
|
||||
LIST_AS_USER(""); //no CLI switch supported yet.
|
||||
|
||||
private final String option;
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -135,7 +136,8 @@ public class ResourceHandlerChain implements ResourceHandler {
|
|||
return allOperations;
|
||||
}
|
||||
|
||||
List<ResourceHandler> getResourceHandlerList() {
|
||||
@VisibleForTesting
|
||||
public List<ResourceHandler> getResourceHandlerList() {
|
||||
return Collections.unmodifiableList(resourceHandlers);
|
||||
}
|
||||
|
||||
|
|
|
@ -21,25 +21,28 @@
|
|||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Provides mechanisms to get various resource handlers - cpu, memory, network,
|
||||
|
@ -206,22 +209,41 @@ public class ResourceHandlerModule {
|
|||
}
|
||||
|
||||
private static void initializeConfiguredResourceHandlerChain(
|
||||
Configuration conf) throws ResourceHandlerException {
|
||||
Configuration conf, Context nmContext)
|
||||
throws ResourceHandlerException {
|
||||
ArrayList<ResourceHandler> handlerList = new ArrayList<>();
|
||||
|
||||
addHandlerIfNotNull(handlerList, getOutboundBandwidthResourceHandler(conf));
|
||||
addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf));
|
||||
addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf));
|
||||
addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf));
|
||||
addHandlersFromConfiguredResourcePlugins(handlerList, conf, nmContext);
|
||||
resourceHandlerChain = new ResourceHandlerChain(handlerList);
|
||||
}
|
||||
|
||||
private static void addHandlersFromConfiguredResourcePlugins(
|
||||
List<ResourceHandler> handlerList, Configuration conf,
|
||||
Context nmContext) throws ResourceHandlerException {
|
||||
ResourcePluginManager pluginManager = nmContext.getResourcePluginManager();
|
||||
if (pluginManager != null) {
|
||||
Map<String, ResourcePlugin> pluginMap = pluginManager.getNameToPlugins();
|
||||
if (pluginMap != null) {
|
||||
for (ResourcePlugin plugin : pluginMap.values()) {
|
||||
addHandlerIfNotNull(handlerList, plugin
|
||||
.createResourceHandler(nmContext,
|
||||
getInitializedCGroupsHandler(conf),
|
||||
PrivilegedOperationExecutor.getInstance(conf)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static ResourceHandlerChain getConfiguredResourceHandlerChain(
|
||||
Configuration conf) throws ResourceHandlerException {
|
||||
Configuration conf, Context nmContext) throws ResourceHandlerException {
|
||||
if (resourceHandlerChain == null) {
|
||||
synchronized (ResourceHandlerModule.class) {
|
||||
if (resourceHandlerChain == null) {
|
||||
initializeConfiguredResourceHandlerChain(conf);
|
||||
initializeConfiguredResourceHandlerChain(conf, nmContext);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,242 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.Sets;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
|
||||
|
||||
/**
|
||||
* Allocate GPU resources according to requirements
|
||||
*/
|
||||
public class GpuResourceAllocator {
|
||||
final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
|
||||
|
||||
private Set<Integer> allowedGpuDevices = new TreeSet<>();
|
||||
private Map<Integer, ContainerId> usedDevices = new TreeMap<>();
|
||||
private Context nmContext;
|
||||
|
||||
public GpuResourceAllocator(Context ctx) {
|
||||
this.nmContext = ctx;
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains allowed and denied devices with minor number.
|
||||
* Denied devices will be useful for cgroups devices module to do blacklisting
|
||||
*/
|
||||
static class GpuAllocation {
|
||||
private Set<Integer> allowed = Collections.emptySet();
|
||||
private Set<Integer> denied = Collections.emptySet();
|
||||
|
||||
GpuAllocation(Set<Integer> allowed, Set<Integer> denied) {
|
||||
if (allowed != null) {
|
||||
this.allowed = ImmutableSet.copyOf(allowed);
|
||||
}
|
||||
if (denied != null) {
|
||||
this.denied = ImmutableSet.copyOf(denied);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<Integer> getAllowedGPUs() {
|
||||
return allowed;
|
||||
}
|
||||
|
||||
public Set<Integer> getDeniedGPUs() {
|
||||
return denied;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add GPU to allowed list
|
||||
* @param minorNumber minor number of the GPU device.
|
||||
*/
|
||||
public synchronized void addGpu(int minorNumber) {
|
||||
allowedGpuDevices.add(minorNumber);
|
||||
}
|
||||
|
||||
private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices,
|
||||
ContainerId containerId) {
|
||||
return "Failed to find enough GPUs, requestor=" + containerId
|
||||
+ ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus="
|
||||
+ getAvailableGpus();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public synchronized int getAvailableGpus() {
|
||||
return allowedGpuDevices.size() - usedDevices.size();
|
||||
}
|
||||
|
||||
public synchronized void recoverAssignedGpus(ContainerId containerId)
|
||||
throws ResourceHandlerException {
|
||||
Container c = nmContext.getContainers().get(containerId);
|
||||
if (null == c) {
|
||||
throw new ResourceHandlerException(
|
||||
"This shouldn't happen, cannot find container with id="
|
||||
+ containerId);
|
||||
}
|
||||
|
||||
for (Serializable deviceId : c.getResourceMappings().getAssignedResources(
|
||||
GPU_URI)){
|
||||
if (!(deviceId instanceof String)) {
|
||||
throw new ResourceHandlerException(
|
||||
"Trying to recover device id, however it"
|
||||
+ " is not String, this shouldn't happen");
|
||||
}
|
||||
|
||||
|
||||
int devId;
|
||||
try {
|
||||
devId = Integer.parseInt((String)deviceId);
|
||||
} catch (NumberFormatException e) {
|
||||
throw new ResourceHandlerException("Failed to recover device id because"
|
||||
+ "it is not a valid integer, devId:" + deviceId);
|
||||
}
|
||||
|
||||
// Make sure it is in allowed GPU device.
|
||||
if (!allowedGpuDevices.contains(devId)) {
|
||||
throw new ResourceHandlerException("Try to recover device id = " + devId
|
||||
+ " however it is not in allowed device list:" + StringUtils
|
||||
.join(",", allowedGpuDevices));
|
||||
}
|
||||
|
||||
// Make sure it is not occupied by anybody else
|
||||
if (usedDevices.containsKey(devId)) {
|
||||
throw new ResourceHandlerException("Try to recover device id = " + devId
|
||||
+ " however it is already assigned to container=" + usedDevices
|
||||
.get(devId) + ", please double check what happened.");
|
||||
}
|
||||
|
||||
usedDevices.put(devId, containerId);
|
||||
}
|
||||
}
|
||||
|
||||
private int getRequestedGpus(Resource requestedResource) {
|
||||
try {
|
||||
return Long.valueOf(requestedResource.getResourceValue(
|
||||
GPU_URI)).intValue();
|
||||
} catch (ResourceNotFoundException e) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Assign GPU to requestor
|
||||
* @param container container to allocate
|
||||
* @return List of denied Gpus with minor numbers
|
||||
* @throws ResourceHandlerException When failed to
|
||||
*/
|
||||
public synchronized GpuAllocation assignGpus(Container container)
|
||||
throws ResourceHandlerException {
|
||||
Resource requestedResource = container.getResource();
|
||||
ContainerId containerId = container.getContainerId();
|
||||
int numRequestedGpuDevices = getRequestedGpus(requestedResource);
|
||||
// Assign Gpus to container if requested some.
|
||||
if (numRequestedGpuDevices > 0) {
|
||||
if (numRequestedGpuDevices > getAvailableGpus()) {
|
||||
throw new ResourceHandlerException(
|
||||
getResourceHandlerExceptionMessage(numRequestedGpuDevices,
|
||||
containerId));
|
||||
}
|
||||
|
||||
Set<Integer> assignedGpus = new HashSet<>();
|
||||
|
||||
for (int deviceNum : allowedGpuDevices) {
|
||||
if (!usedDevices.containsKey(deviceNum)) {
|
||||
usedDevices.put(deviceNum, containerId);
|
||||
assignedGpus.add(deviceNum);
|
||||
if (assignedGpus.size() == numRequestedGpuDevices) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Record in state store if we allocated anything
|
||||
if (!assignedGpus.isEmpty()) {
|
||||
List<Serializable> allocatedDevices = new ArrayList<>();
|
||||
for (int gpu : assignedGpus) {
|
||||
allocatedDevices.add(String.valueOf(gpu));
|
||||
}
|
||||
try {
|
||||
// Update Container#getResourceMapping.
|
||||
ResourceMappings.AssignedResources assignedResources =
|
||||
new ResourceMappings.AssignedResources();
|
||||
assignedResources.updateAssignedResources(allocatedDevices);
|
||||
container.getResourceMappings().addAssignedResources(GPU_URI,
|
||||
assignedResources);
|
||||
|
||||
// Update state store.
|
||||
nmContext.getNMStateStore().storeAssignedResources(containerId,
|
||||
GPU_URI, allocatedDevices);
|
||||
} catch (IOException e) {
|
||||
cleanupAssignGpus(containerId);
|
||||
throw new ResourceHandlerException(e);
|
||||
}
|
||||
}
|
||||
|
||||
return new GpuAllocation(assignedGpus,
|
||||
Sets.difference(allowedGpuDevices, assignedGpus));
|
||||
}
|
||||
return new GpuAllocation(null, allowedGpuDevices);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean up all Gpus assigned to containerId
|
||||
* @param containerId containerId
|
||||
*/
|
||||
public synchronized void cleanupAssignGpus(ContainerId containerId) {
|
||||
Iterator<Map.Entry<Integer, ContainerId>> iter =
|
||||
usedDevices.entrySet().iterator();
|
||||
while (iter.hasNext()) {
|
||||
if (iter.next().getValue().equals(containerId)) {
|
||||
iter.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public synchronized Map<Integer, ContainerId> getDeviceAllocationMapping() {
|
||||
return new HashMap<>(usedDevices);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,153 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class GpuResourceHandlerImpl implements ResourceHandler {
|
||||
final static Log LOG = LogFactory
|
||||
.getLog(GpuResourceHandlerImpl.class);
|
||||
|
||||
// This will be used by container-executor to add necessary clis
|
||||
public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus";
|
||||
public static final String CONTAINER_ID_CLI_OPTION = "--container_id";
|
||||
|
||||
private GpuResourceAllocator gpuAllocator;
|
||||
private CGroupsHandler cGroupsHandler;
|
||||
private PrivilegedOperationExecutor privilegedOperationExecutor;
|
||||
|
||||
public GpuResourceHandlerImpl(Context nmContext,
|
||||
CGroupsHandler cGroupsHandler,
|
||||
PrivilegedOperationExecutor privilegedOperationExecutor) {
|
||||
this.cGroupsHandler = cGroupsHandler;
|
||||
this.privilegedOperationExecutor = privilegedOperationExecutor;
|
||||
gpuAllocator = new GpuResourceAllocator(nmContext);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> bootstrap(Configuration configuration)
|
||||
throws ResourceHandlerException {
|
||||
List<Integer> minorNumbersOfUsableGpus;
|
||||
try {
|
||||
minorNumbersOfUsableGpus = GpuDiscoverer.getInstance()
|
||||
.getMinorNumbersOfGpusUsableByYarn();
|
||||
} catch (YarnException e) {
|
||||
LOG.error("Exception when trying to get usable GPU device", e);
|
||||
throw new ResourceHandlerException(e);
|
||||
}
|
||||
|
||||
for (int minorNumber : minorNumbersOfUsableGpus) {
|
||||
gpuAllocator.addGpu(minorNumber);
|
||||
}
|
||||
|
||||
// And initialize cgroups
|
||||
this.cGroupsHandler.initializeCGroupController(
|
||||
CGroupsHandler.CGroupController.DEVICES);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized List<PrivilegedOperation> preStart(Container container)
|
||||
throws ResourceHandlerException {
|
||||
String containerIdStr = container.getContainerId().toString();
|
||||
|
||||
// Assign Gpus to container if requested some.
|
||||
GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus(
|
||||
container);
|
||||
|
||||
// Create device cgroups for the container
|
||||
cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES,
|
||||
containerIdStr);
|
||||
try {
|
||||
// Execute c-e to setup GPU isolation before launch the container
|
||||
PrivilegedOperation privilegedOperation = new PrivilegedOperation(
|
||||
PrivilegedOperation.OperationType.GPU, Arrays
|
||||
.asList(CONTAINER_ID_CLI_OPTION, containerIdStr));
|
||||
if (!allocation.getDeniedGPUs().isEmpty()) {
|
||||
privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
|
||||
StringUtils.join(",", allocation.getDeniedGPUs())));
|
||||
}
|
||||
|
||||
privilegedOperationExecutor.executePrivilegedOperation(
|
||||
privilegedOperation, true);
|
||||
} catch (PrivilegedOperationException e) {
|
||||
cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
|
||||
containerIdStr);
|
||||
LOG.warn("Could not update cgroup for container", e);
|
||||
throw new ResourceHandlerException(e);
|
||||
}
|
||||
|
||||
List<PrivilegedOperation> ret = new ArrayList<>();
|
||||
ret.add(new PrivilegedOperation(
|
||||
PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP,
|
||||
PrivilegedOperation.CGROUP_ARG_PREFIX
|
||||
+ cGroupsHandler.getPathForCGroupTasks(
|
||||
CGroupsHandler.CGroupController.DEVICES, containerIdStr)));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public GpuResourceAllocator getGpuAllocator() {
|
||||
return gpuAllocator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
|
||||
throws ResourceHandlerException {
|
||||
gpuAllocator.recoverAssignedGpus(containerId);
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized List<PrivilegedOperation> postComplete(
|
||||
ContainerId containerId) throws ResourceHandlerException {
|
||||
gpuAllocator.cleanupAssignGpus(containerId);
|
||||
cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
|
||||
containerId.toString());
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> teardown() throws ResourceHandlerException {
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
|
||||
/**
|
||||
* Plugins to handle resources on a node. This will be used by
|
||||
* {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}
|
||||
*/
|
||||
public abstract class NodeResourceUpdaterPlugin {
|
||||
/**
|
||||
* Update configured resource for the given component.
|
||||
* @param res resource passed in by external mododule (such as
|
||||
* {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}
|
||||
* @throws YarnException when any issue happens.
|
||||
*/
|
||||
public abstract void updateConfiguredResource(Resource res)
|
||||
throws YarnException;
|
||||
|
||||
/**
|
||||
* This method will be called when the node's resource is loaded from
|
||||
* dynamic-resources.xml in ResourceManager.
|
||||
*
|
||||
* @param newResource newResource reported by RM
|
||||
* @throws YarnException when any mismatch between NM/RM
|
||||
*/
|
||||
public void handleUpdatedResourceFromRM(Resource newResource) throws
|
||||
YarnException {
|
||||
// by default do nothing, subclass should implement this method when any
|
||||
// special activities required upon new resource reported by RM.
|
||||
}
|
||||
|
||||
// TODO: add implementation to update node attribute once YARN-3409 merged.
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
|
||||
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
|
||||
|
||||
/**
|
||||
* {@link ResourcePlugin} is an interface for node manager to easier support
|
||||
* discovery/manage/isolation for new resource types.
|
||||
*
|
||||
* <p>
|
||||
* It has two major part: {@link ResourcePlugin#createResourceHandler(Context,
|
||||
* CGroupsHandler, PrivilegedOperationExecutor)} and
|
||||
* {@link ResourcePlugin#getNodeResourceHandlerInstance()}, see javadocs below
|
||||
* for more details.
|
||||
* </p>
|
||||
*/
|
||||
public interface ResourcePlugin {
|
||||
/**
|
||||
* Initialize the plugin, this will be invoked during NM startup.
|
||||
* @param context NM Context
|
||||
* @throws YarnException when any issue occurs
|
||||
*/
|
||||
void initialize(Context context) throws YarnException;
|
||||
|
||||
/**
|
||||
* Plugin needs to return {@link ResourceHandler} when any special isolation
|
||||
* required for the resource type. This will be added to
|
||||
* {@link ResourceHandlerChain} during NodeManager startup. When no special
|
||||
* isolation need, return null.
|
||||
*
|
||||
* @param nmContext NodeManager context.
|
||||
* @param cGroupsHandler CGroupsHandler
|
||||
* @param privilegedOperationExecutor Privileged Operation Executor.
|
||||
* @return ResourceHandler
|
||||
*/
|
||||
ResourceHandler createResourceHandler(Context nmContext,
|
||||
CGroupsHandler cGroupsHandler,
|
||||
PrivilegedOperationExecutor privilegedOperationExecutor);
|
||||
|
||||
/**
|
||||
* Plugin needs to return {@link NodeResourceUpdaterPlugin} when any discovery
|
||||
* mechanism required for the resource type. For example, if we want to set
|
||||
* resource-value during NM registration or send update during NM-RM heartbeat
|
||||
* We can implement a {@link NodeResourceUpdaterPlugin} and update fields of
|
||||
* {@link org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest}
|
||||
* or {@link org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest}
|
||||
*
|
||||
* This will be invoked during every node status update or node registration,
|
||||
* please avoid creating new instance every time.
|
||||
*
|
||||
* @return NodeResourceUpdaterPlugin, could be null when no discovery needed.
|
||||
*/
|
||||
NodeResourceUpdaterPlugin getNodeResourceHandlerInstance();
|
||||
|
||||
/**
|
||||
* Do cleanup of the plugin, this will be invoked when
|
||||
* {@link org.apache.hadoop.yarn.server.nodemanager.NodeManager} stops
|
||||
* @throws YarnException if any issue occurs
|
||||
*/
|
||||
void cleanup() throws YarnException;
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
|
||||
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
|
||||
|
||||
/**
|
||||
* Manages {@link ResourcePlugin} configured on this NodeManager.
|
||||
*/
|
||||
public class ResourcePluginManager {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(ResourcePluginManager.class);
|
||||
private static final Set<String> SUPPORTED_RESOURCE_PLUGINS = ImmutableSet.of(
|
||||
GPU_URI);
|
||||
|
||||
private Map<String, ResourcePlugin> configuredPlugins = Collections.EMPTY_MAP;
|
||||
|
||||
public synchronized void initialize(Context context)
|
||||
throws YarnException {
|
||||
Configuration conf = context.getConf();
|
||||
String[] plugins = conf.getStrings(YarnConfiguration.NM_RESOURCE_PLUGINS);
|
||||
|
||||
if (plugins != null) {
|
||||
Map<String, ResourcePlugin> pluginMap = new HashMap<>();
|
||||
|
||||
// Initialize each plugins
|
||||
for (String resourceName : plugins) {
|
||||
resourceName = resourceName.trim();
|
||||
if (!SUPPORTED_RESOURCE_PLUGINS.contains(resourceName)) {
|
||||
String msg =
|
||||
"Trying to initialize resource plugin with name=" + resourceName
|
||||
+ ", it is not supported, list of supported plugins:"
|
||||
+ StringUtils.join(",",
|
||||
SUPPORTED_RESOURCE_PLUGINS);
|
||||
LOG.error(msg);
|
||||
throw new YarnException(msg);
|
||||
}
|
||||
|
||||
if (pluginMap.containsKey(resourceName)) {
|
||||
// Duplicated items, ignore ...
|
||||
continue;
|
||||
}
|
||||
|
||||
ResourcePlugin plugin = null;
|
||||
if (resourceName.equals(GPU_URI)) {
|
||||
plugin = new GpuResourcePlugin();
|
||||
}
|
||||
|
||||
if (plugin == null) {
|
||||
throw new YarnException(
|
||||
"This shouldn't happen, plugin=" + resourceName
|
||||
+ " should be loaded and initialized");
|
||||
}
|
||||
plugin.initialize(context);
|
||||
pluginMap.put(resourceName, plugin);
|
||||
}
|
||||
|
||||
configuredPlugins = Collections.unmodifiableMap(pluginMap);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void cleanup() throws YarnException {
|
||||
for (ResourcePlugin plugin : configuredPlugins.values()) {
|
||||
plugin.cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get resource name (such as gpu/fpga) to plugin references.
|
||||
* @return read-only map of resource name to plugins.
|
||||
*/
|
||||
public synchronized Map<String, ResourcePlugin> getNameToPlugins() {
|
||||
return configuredPlugins;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,254 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.Shell;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
public class GpuDiscoverer {
|
||||
public static final Logger LOG = LoggerFactory.getLogger(
|
||||
GpuDiscoverer.class);
|
||||
@VisibleForTesting
|
||||
protected static final String DEFAULT_BINARY_NAME = "nvidia-smi";
|
||||
|
||||
// When executable path not set, try to search default dirs
|
||||
// By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
|
||||
// launched by nvidia-docker).
|
||||
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
|
||||
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
|
||||
|
||||
// command should not run more than 10 sec.
|
||||
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
|
||||
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
|
||||
private static GpuDiscoverer instance;
|
||||
|
||||
static {
|
||||
instance = new GpuDiscoverer();
|
||||
}
|
||||
|
||||
private Configuration conf = null;
|
||||
private String pathOfGpuBinary = null;
|
||||
private Map<String, String> environment = new HashMap<>();
|
||||
private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
||||
|
||||
private int numOfErrorExecutionSinceLastSucceed = 0;
|
||||
GpuDeviceInformation lastDiscoveredGpuInformation = null;
|
||||
|
||||
private void validateConfOrThrowException() throws YarnException {
|
||||
if (conf == null) {
|
||||
throw new YarnException("Please initialize (call initialize) before use "
|
||||
+ GpuDiscoverer.class.getSimpleName());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get GPU device information from system.
|
||||
* This need to be called after initialize.
|
||||
*
|
||||
* Please note that this only works on *NIX platform, so external caller
|
||||
* need to make sure this.
|
||||
*
|
||||
* @return GpuDeviceInformation
|
||||
* @throws YarnException when any error happens
|
||||
*/
|
||||
public synchronized GpuDeviceInformation getGpuDeviceInformation()
|
||||
throws YarnException {
|
||||
validateConfOrThrowException();
|
||||
|
||||
if (null == pathOfGpuBinary) {
|
||||
throw new YarnException(
|
||||
"Failed to find GPU discovery executable, please double check "
|
||||
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
|
||||
}
|
||||
|
||||
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
|
||||
String msg =
|
||||
"Failed to execute GPU device information detection script for "
|
||||
+ MAX_REPEATED_ERROR_ALLOWED
|
||||
+ " times, skip following executions.";
|
||||
LOG.error(msg);
|
||||
throw new YarnException(msg);
|
||||
}
|
||||
|
||||
String output;
|
||||
try {
|
||||
output = Shell.execCommand(environment,
|
||||
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
|
||||
GpuDeviceInformation info = parser.parseXml(output);
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
lastDiscoveredGpuInformation = info;
|
||||
return info;
|
||||
} catch (IOException e) {
|
||||
numOfErrorExecutionSinceLastSucceed++;
|
||||
String msg =
|
||||
"Failed to execute " + pathOfGpuBinary + " exception message:" + e
|
||||
.getMessage() + ", continue ...";
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(msg);
|
||||
}
|
||||
throw new YarnException(e);
|
||||
} catch (YarnException e) {
|
||||
numOfErrorExecutionSinceLastSucceed++;
|
||||
String msg = "Failed to parse xml output" + e.getMessage();
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.warn(msg, e);
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get list of minor device numbers of Gpu devices usable by YARN.
|
||||
*
|
||||
* @return List of minor device numbers of Gpu devices.
|
||||
* @throws YarnException when any issue happens
|
||||
*/
|
||||
public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
|
||||
throws YarnException {
|
||||
validateConfOrThrowException();
|
||||
|
||||
String allowedDevicesStr = conf.get(
|
||||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
|
||||
List<Integer> minorNumbers = new ArrayList<>();
|
||||
|
||||
if (allowedDevicesStr.equals(
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
|
||||
// Get gpu device information from system.
|
||||
if (null == lastDiscoveredGpuInformation) {
|
||||
String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
|
||||
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
|
||||
+ ", however automatically discovering "
|
||||
+ "GPU information failed, please check NodeManager log for more"
|
||||
+ " details, as an alternative, admin can specify "
|
||||
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES
|
||||
+ " manually to enable GPU isolation.";
|
||||
LOG.error(msg);
|
||||
throw new YarnException(msg);
|
||||
}
|
||||
|
||||
if (lastDiscoveredGpuInformation.getGpus() != null) {
|
||||
for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
|
||||
.getGpus()) {
|
||||
minorNumbers.add(gpu.getMinorNumber());
|
||||
}
|
||||
}
|
||||
} else{
|
||||
for (String s : allowedDevicesStr.split(",")) {
|
||||
if (s.trim().length() > 0) {
|
||||
minorNumbers.add(Integer.valueOf(s.trim()));
|
||||
}
|
||||
}
|
||||
LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
|
||||
}
|
||||
|
||||
return minorNumbers;
|
||||
}
|
||||
|
||||
public synchronized void initialize(Configuration conf) throws YarnException {
|
||||
this.conf = conf;
|
||||
numOfErrorExecutionSinceLastSucceed = 0;
|
||||
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
|
||||
YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
|
||||
if (pathToExecutable.isEmpty()) {
|
||||
pathToExecutable = DEFAULT_BINARY_NAME;
|
||||
}
|
||||
|
||||
// Validate file existence
|
||||
File binaryPath = new File(pathToExecutable);
|
||||
|
||||
if (!binaryPath.exists()) {
|
||||
// When binary not exist, use default setting.
|
||||
boolean found = false;
|
||||
for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
|
||||
binaryPath = new File(dir, DEFAULT_BINARY_NAME);
|
||||
if (binaryPath.exists()) {
|
||||
found = true;
|
||||
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
|
||||
+ ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC
|
||||
+ "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME);
|
||||
}
|
||||
} else{
|
||||
// If path specified by user is a directory, use
|
||||
if (binaryPath.isDirectory()) {
|
||||
binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
|
||||
LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
|
||||
+ " under the directory, updated path-to-executable:" + binaryPath
|
||||
.getAbsolutePath());
|
||||
}
|
||||
// Validated
|
||||
pathOfGpuBinary = binaryPath.getAbsolutePath();
|
||||
}
|
||||
|
||||
// Try to discover GPU information once and print
|
||||
try {
|
||||
LOG.info("Trying to discover GPU information ...");
|
||||
GpuDeviceInformation info = getGpuDeviceInformation();
|
||||
LOG.info(info.toString());
|
||||
} catch (YarnException e) {
|
||||
String msg =
|
||||
"Failed to discover GPU information from system, exception message:"
|
||||
+ e.getMessage() + " continue...";
|
||||
LOG.warn(msg);
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected Map<String, String> getEnvironmentToRunCommand() {
|
||||
return environment;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected String getPathOfGpuBinary() {
|
||||
return pathOfGpuBinary;
|
||||
}
|
||||
|
||||
public static GpuDiscoverer getInstance() {
|
||||
return instance;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
|
||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
|
||||
|
||||
public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
|
||||
|
||||
@Override
|
||||
public void updateConfiguredResource(Resource res) throws YarnException {
|
||||
LOG.info("Initializing configured GPU resources for the NodeManager.");
|
||||
|
||||
List<Integer> usableGpus =
|
||||
GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
|
||||
if (null == usableGpus || usableGpus.isEmpty()) {
|
||||
LOG.info("Didn't find any usable GPUs on the NodeManager.");
|
||||
// No gpu can be used by YARN.
|
||||
return;
|
||||
}
|
||||
|
||||
long nUsableGpus = usableGpus.size();
|
||||
|
||||
Map<String, ResourceInformation> configuredResourceTypes =
|
||||
ResourceUtils.getResourceTypes();
|
||||
if (!configuredResourceTypes.containsKey(GPU_URI)) {
|
||||
throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
|
||||
+ GPU_URI
|
||||
+ " resource-type is not configured inside"
|
||||
+ " resource-types.xml, please configure it to enable GPU feature or"
|
||||
+ " remove " + GPU_URI + " from "
|
||||
+ YarnConfiguration.NM_RESOURCE_PLUGINS);
|
||||
}
|
||||
|
||||
res.setResourceValue(GPU_URI, nUsableGpus);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
|
||||
|
||||
public class GpuResourcePlugin implements ResourcePlugin {
|
||||
private ResourceHandler gpuResourceHandler = null;
|
||||
private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
|
||||
|
||||
@Override
|
||||
public synchronized void initialize(Context context) throws YarnException {
|
||||
resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
|
||||
GpuDiscoverer.getInstance().initialize(context.getConf());
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized ResourceHandler createResourceHandler(
|
||||
Context context, CGroupsHandler cGroupsHandler,
|
||||
PrivilegedOperationExecutor privilegedOperationExecutor) {
|
||||
if (gpuResourceHandler == null) {
|
||||
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
|
||||
privilegedOperationExecutor);
|
||||
}
|
||||
|
||||
return gpuResourceHandler;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
|
||||
return resourceDiscoverHandler;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cleanup() throws YarnException {
|
||||
// Do nothing.
|
||||
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
|
||||
import javax.xml.bind.annotation.XmlRootElement;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* All GPU Device Information in the system.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
@XmlRootElement(name = "nvidia_smi_log")
|
||||
public class GpuDeviceInformation {
|
||||
List<PerGpuDeviceInformation> gpus;
|
||||
|
||||
String driverVersion = "N/A";
|
||||
|
||||
// More fields like topology information could be added when needed.
|
||||
// ...
|
||||
|
||||
@javax.xml.bind.annotation.XmlElement(name = "gpu")
|
||||
public List<PerGpuDeviceInformation> getGpus() {
|
||||
return gpus;
|
||||
}
|
||||
|
||||
public void setGpus(List<PerGpuDeviceInformation> gpus) {
|
||||
this.gpus = gpus;
|
||||
}
|
||||
|
||||
@javax.xml.bind.annotation.XmlElement(name = "driver_version")
|
||||
public String getDriverVersion() {
|
||||
return driverVersion;
|
||||
}
|
||||
|
||||
public void setDriverVersion(String driverVersion) {
|
||||
this.driverVersion = driverVersion;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append(
|
||||
getDriverVersion()).append("\n");
|
||||
|
||||
if (gpus != null) {
|
||||
for (PerGpuDeviceInformation gpu : gpus) {
|
||||
sb.append("\t").append(gpu.toString()).append("\n");
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* <p>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.XMLReader;
|
||||
|
||||
import javax.xml.bind.JAXBContext;
|
||||
import javax.xml.bind.JAXBException;
|
||||
import javax.xml.bind.Unmarshaller;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
import javax.xml.transform.sax.SAXSource;
|
||||
import java.io.StringReader;
|
||||
|
||||
/**
|
||||
* Parse XML and get GPU device information
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
public class GpuDeviceInformationParser {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(
|
||||
GpuDeviceInformationParser.class);
|
||||
|
||||
private Unmarshaller unmarshaller = null;
|
||||
private XMLReader xmlReader = null;
|
||||
|
||||
private void init()
|
||||
throws SAXException, ParserConfigurationException, JAXBException {
|
||||
SAXParserFactory spf = SAXParserFactory.newInstance();
|
||||
// Disable external-dtd since by default nvidia-smi output contains
|
||||
// <!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd"> in header
|
||||
spf.setFeature(
|
||||
"http://apache.org/xml/features/nonvalidating/load-external-dtd",
|
||||
false);
|
||||
spf.setFeature("http://xml.org/sax/features/validation", false);
|
||||
|
||||
JAXBContext jaxbContext = JAXBContext.newInstance(
|
||||
GpuDeviceInformation.class);
|
||||
|
||||
this.xmlReader = spf.newSAXParser().getXMLReader();
|
||||
this.unmarshaller = jaxbContext.createUnmarshaller();
|
||||
}
|
||||
|
||||
public synchronized GpuDeviceInformation parseXml(String xmlContent)
|
||||
throws YarnException {
|
||||
if (unmarshaller == null) {
|
||||
try {
|
||||
init();
|
||||
} catch (SAXException | ParserConfigurationException | JAXBException e) {
|
||||
LOG.error("Exception while initialize parser", e);
|
||||
throw new YarnException(e);
|
||||
}
|
||||
}
|
||||
|
||||
InputSource inputSource = new InputSource(new StringReader(xmlContent));
|
||||
SAXSource source = new SAXSource(xmlReader, inputSource);
|
||||
try {
|
||||
return (GpuDeviceInformation) unmarshaller.unmarshal(source);
|
||||
} catch (JAXBException e) {
|
||||
LOG.error("Exception while parsing xml", e);
|
||||
throw new YarnException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
|
||||
import javax.xml.bind.annotation.XmlElement;
|
||||
import javax.xml.bind.annotation.XmlRootElement;
|
||||
import javax.xml.bind.annotation.adapters.XmlAdapter;
|
||||
|
||||
/**
|
||||
* Capture single GPU device information such as memory size, temperature,
|
||||
* utilization.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
@XmlRootElement(name = "gpu")
|
||||
public class PerGpuDeviceInformation {
|
||||
|
||||
private String productName = "N/A";
|
||||
private String uuid = "N/A";
|
||||
private int minorNumber = -1;
|
||||
|
||||
private PerGpuUtilizations gpuUtilizations;
|
||||
private PerGpuMemoryUsage gpuMemoryUsage;
|
||||
private PerGpuTemperature temperature;
|
||||
|
||||
/**
|
||||
* Convert formats like "34 C", "75.6 %" to float.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
static class StrToFloatBeforeSpaceAdapter extends
|
||||
XmlAdapter<String, Float> {
|
||||
@Override
|
||||
public String marshal(Float v) throws Exception {
|
||||
if (v == null) {
|
||||
return "";
|
||||
}
|
||||
return String.valueOf(v);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Float unmarshal(String v) throws Exception {
|
||||
if (v == null) {
|
||||
return -1f;
|
||||
}
|
||||
|
||||
return Float.valueOf(v.split(" ")[0]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert formats like "725 MiB" to long.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
static class StrToMemAdapter extends XmlAdapter<String, Long> {
|
||||
@Override
|
||||
public String marshal(Long v) throws Exception {
|
||||
if (v == null) {
|
||||
return "";
|
||||
}
|
||||
return String.valueOf(v) + " MiB";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long unmarshal(String v) throws Exception {
|
||||
if (v == null) {
|
||||
return -1L;
|
||||
}
|
||||
return Long.valueOf(v.split(" ")[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@XmlElement(name = "temperature")
|
||||
public PerGpuTemperature getTemperature() {
|
||||
return temperature;
|
||||
}
|
||||
|
||||
public void setTemperature(PerGpuTemperature temperature) {
|
||||
this.temperature = temperature;
|
||||
}
|
||||
|
||||
@XmlElement(name = "uuid")
|
||||
public String getUuid() {
|
||||
return uuid;
|
||||
}
|
||||
|
||||
public void setUuid(String uuid) {
|
||||
this.uuid = uuid;
|
||||
}
|
||||
|
||||
@XmlElement(name = "product_name")
|
||||
public String getProductName() {
|
||||
return productName;
|
||||
}
|
||||
|
||||
public void setProductName(String productName) {
|
||||
this.productName = productName;
|
||||
}
|
||||
|
||||
@XmlElement(name = "minor_number")
|
||||
public int getMinorNumber() {
|
||||
return minorNumber;
|
||||
}
|
||||
|
||||
public void setMinorNumber(int minorNumber) {
|
||||
this.minorNumber = minorNumber;
|
||||
}
|
||||
|
||||
@XmlElement(name = "utilization")
|
||||
public PerGpuUtilizations getGpuUtilizations() {
|
||||
return gpuUtilizations;
|
||||
}
|
||||
|
||||
public void setGpuUtilizations(PerGpuUtilizations utilizations) {
|
||||
this.gpuUtilizations = utilizations;
|
||||
}
|
||||
|
||||
@XmlElement(name = "bar1_memory_usage")
|
||||
public PerGpuMemoryUsage getGpuMemoryUsage() {
|
||||
return gpuMemoryUsage;
|
||||
}
|
||||
|
||||
public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
|
||||
this.gpuMemoryUsage = gpuMemoryUsage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("ProductName=").append(productName).append(", MinorNumber=")
|
||||
.append(minorNumber);
|
||||
|
||||
if (getGpuMemoryUsage() != null) {
|
||||
sb.append(", TotalMemory=").append(
|
||||
getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
|
||||
}
|
||||
|
||||
if (getGpuUtilizations() != null) {
|
||||
sb.append(", Utilization=").append(
|
||||
getGpuUtilizations().getOverallGpuUtilization()).append("%");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
|
||||
import javax.xml.bind.annotation.XmlElement;
|
||||
import javax.xml.bind.annotation.XmlRootElement;
|
||||
import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
|
||||
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
@XmlRootElement(name = "bar1_memory_usage")
|
||||
public class PerGpuMemoryUsage {
|
||||
long usedMemoryMiB = -1L;
|
||||
long availMemoryMiB = -1L;
|
||||
|
||||
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
|
||||
@XmlElement(name = "used")
|
||||
public Long getUsedMemoryMiB() {
|
||||
return usedMemoryMiB;
|
||||
}
|
||||
|
||||
public void setUsedMemoryMiB(Long usedMemoryMiB) {
|
||||
this.usedMemoryMiB = usedMemoryMiB;
|
||||
}
|
||||
|
||||
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
|
||||
@XmlElement(name = "free")
|
||||
public Long getAvailMemoryMiB() {
|
||||
return availMemoryMiB;
|
||||
}
|
||||
|
||||
public void setAvailMemoryMiB(Long availMemoryMiB) {
|
||||
this.availMemoryMiB = availMemoryMiB;
|
||||
}
|
||||
|
||||
public long getTotalMemoryMiB() {
|
||||
return usedMemoryMiB + availMemoryMiB;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* <p>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
|
||||
import javax.xml.bind.annotation.XmlElement;
|
||||
import javax.xml.bind.annotation.XmlRootElement;
|
||||
import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
|
||||
|
||||
/**
|
||||
* Temperature of GPU
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
@XmlRootElement(name = "temperature")
|
||||
public class PerGpuTemperature {
|
||||
private float currentGpuTemp = Float.MIN_VALUE;
|
||||
private float maxGpuTemp = Float.MIN_VALUE;
|
||||
private float slowThresholdGpuTemp = Float.MIN_VALUE;
|
||||
|
||||
/**
|
||||
* Get current celsius GPU temperature
|
||||
* @return temperature
|
||||
*/
|
||||
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
|
||||
@XmlElement(name = "gpu_temp")
|
||||
public Float getCurrentGpuTemp() {
|
||||
return currentGpuTemp;
|
||||
}
|
||||
|
||||
public void setCurrentGpuTemp(Float currentGpuTemp) {
|
||||
this.currentGpuTemp = currentGpuTemp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get max possible celsius GPU temperature
|
||||
* @return temperature
|
||||
*/
|
||||
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
|
||||
@XmlElement(name = "gpu_temp_max_threshold")
|
||||
public Float getMaxGpuTemp() {
|
||||
return maxGpuTemp;
|
||||
}
|
||||
|
||||
public void setMaxGpuTemp(Float maxGpuTemp) {
|
||||
this.maxGpuTemp = maxGpuTemp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get celsius GPU temperature which could make GPU runs slower
|
||||
* @return temperature
|
||||
*/
|
||||
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
|
||||
@XmlElement(name = "gpu_temp_slow_threshold")
|
||||
public Float getSlowThresholdGpuTemp() {
|
||||
return slowThresholdGpuTemp;
|
||||
}
|
||||
|
||||
public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) {
|
||||
this.slowThresholdGpuTemp = slowThresholdGpuTemp;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
|
||||
import javax.xml.bind.annotation.XmlElement;
|
||||
import javax.xml.bind.annotation.XmlRootElement;
|
||||
import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
|
||||
|
||||
/**
|
||||
* GPU utilizations
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
@XmlRootElement(name = "utilization")
|
||||
public class PerGpuUtilizations {
|
||||
private float overallGpuUtilization;
|
||||
|
||||
/**
|
||||
* Overall percent GPU utilization
|
||||
* @return utilization
|
||||
*/
|
||||
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
|
||||
@XmlElement(name = "gpu_util")
|
||||
public Float getOverallGpuUtilization() {
|
||||
return overallGpuUtilization;
|
||||
}
|
||||
|
||||
public void setOverallGpuUtilization(Float overallGpuUtilization) {
|
||||
this.overallGpuUtilization = overallGpuUtilization;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,164 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||
import org.apache.hadoop.net.ServerSocketUtil;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
import org.apache.hadoop.yarn.server.api.ResourceTracker;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class NodeManagerTestBase {
|
||||
// temp fix until metrics system can auto-detect itself running in unit test:
|
||||
static {
|
||||
DefaultMetricsSystem.setMiniClusterMode(true);
|
||||
}
|
||||
|
||||
protected static final Logger LOG =
|
||||
LoggerFactory.getLogger(TestNodeStatusUpdater.class);
|
||||
protected static final File basedir =
|
||||
new File("target", TestNodeStatusUpdater.class.getName());
|
||||
protected static final File nmLocalDir = new File(basedir, "nm0");
|
||||
protected static final File tmpDir = new File(basedir, "tmpDir");
|
||||
protected static final File remoteLogsDir = new File(basedir, "remotelogs");
|
||||
protected static final File logsDir = new File(basedir, "logs");
|
||||
protected static final RecordFactory recordFactory = RecordFactoryProvider
|
||||
.getRecordFactory(null);
|
||||
protected Configuration conf;
|
||||
|
||||
protected YarnConfiguration createNMConfig() throws IOException {
|
||||
return createNMConfig(ServerSocketUtil.getPort(49170, 10));
|
||||
}
|
||||
|
||||
protected YarnConfiguration createNMConfig(int port) throws IOException {
|
||||
YarnConfiguration conf = new YarnConfiguration();
|
||||
String localhostAddress = null;
|
||||
try {
|
||||
localhostAddress = InetAddress.getByName("localhost")
|
||||
.getCanonicalHostName();
|
||||
} catch (UnknownHostException e) {
|
||||
Assert.fail("Unable to get localhost address: " + e.getMessage());
|
||||
}
|
||||
conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
|
||||
conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
|
||||
conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
|
||||
+ ServerSocketUtil.getPort(49160, 10));
|
||||
conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
|
||||
conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
|
||||
remoteLogsDir.getAbsolutePath());
|
||||
conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
|
||||
conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
|
||||
return conf;
|
||||
}
|
||||
|
||||
public static class BaseResourceTrackerForTest implements ResourceTracker {
|
||||
@Override
|
||||
public RegisterNodeManagerResponse registerNodeManager(
|
||||
RegisterNodeManagerRequest request) throws YarnException, IOException {
|
||||
return new RegisterNodeManagerResponsePBImpl();
|
||||
}
|
||||
|
||||
@Override
|
||||
public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
|
||||
throws YarnException, IOException {
|
||||
return new NodeHeartbeatResponsePBImpl();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UnRegisterNodeManagerResponse unRegisterNodeManager(
|
||||
UnRegisterNodeManagerRequest request)
|
||||
throws YarnException, IOException {
|
||||
return new UnRegisterNodeManagerResponsePBImpl();
|
||||
}
|
||||
}
|
||||
|
||||
protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl {
|
||||
public ResourceTracker resourceTracker;
|
||||
protected Context context;
|
||||
|
||||
public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher,
|
||||
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
|
||||
ResourceTracker resourceTracker) {
|
||||
super(context, dispatcher, healthChecker, metrics);
|
||||
this.context = context;
|
||||
this.resourceTracker = resourceTracker;
|
||||
}
|
||||
@Override
|
||||
protected ResourceTracker getRMClient() {
|
||||
return resourceTracker;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void stopRMProxy() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
public class MyContainerManager extends ContainerManagerImpl {
|
||||
public boolean signaled = false;
|
||||
|
||||
public MyContainerManager(Context context, ContainerExecutor exec,
|
||||
DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
|
||||
NodeManagerMetrics metrics,
|
||||
LocalDirsHandlerService dirsHandler) {
|
||||
super(context, exec, deletionContext, nodeStatusUpdater,
|
||||
metrics, dirsHandler);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handle(ContainerManagerEvent event) {
|
||||
if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
|
||||
signaled = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUp() throws IOException {
|
||||
nmLocalDir.mkdirs();
|
||||
tmpDir.mkdirs();
|
||||
logsDir.mkdirs();
|
||||
remoteLogsDir.mkdirs();
|
||||
conf = createNMConfig();
|
||||
}
|
||||
}
|
|
@ -178,7 +178,7 @@ public class TestDefaultContainerExecutor {
|
|||
FileContext lfs = FileContext.getLocalFSFileContext(conf);
|
||||
DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs);
|
||||
executor.setConf(conf);
|
||||
executor.init();
|
||||
executor.init(null);
|
||||
|
||||
try {
|
||||
executor.createUserLocalDirs(localDirs, user);
|
||||
|
@ -317,7 +317,7 @@ public class TestDefaultContainerExecutor {
|
|||
Path workDir = localDir;
|
||||
Path pidFile = new Path(workDir, "pid.txt");
|
||||
|
||||
mockExec.init();
|
||||
mockExec.init(null);
|
||||
mockExec.activateContainer(cId, pidFile);
|
||||
int ret = mockExec.launchContainer(new ContainerStartContext.Builder()
|
||||
.setContainer(container)
|
||||
|
|
|
@ -116,7 +116,7 @@ public class TestDockerContainerExecutorWithMocks {
|
|||
public void testContainerInitSecure() throws IOException {
|
||||
dockerContainerExecutor.getConf().set(
|
||||
CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
|
||||
dockerContainerExecutor.init();
|
||||
dockerContainerExecutor.init(mock(Context.class));
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
|
|
|
@ -628,7 +628,7 @@ public class TestLinuxContainerExecutor {
|
|||
LinuxContainerExecutor lce = new LinuxContainerExecutor();
|
||||
lce.setConf(conf);
|
||||
try {
|
||||
lce.init();
|
||||
lce.init(null);
|
||||
} catch (IOException e) {
|
||||
// expected if LCE isn't setup right, but not necessary for this test
|
||||
}
|
||||
|
|
|
@ -426,7 +426,7 @@ public class TestLinuxContainerExecutorWithMocks {
|
|||
@Test
|
||||
public void testInit() throws Exception {
|
||||
|
||||
mockExec.init();
|
||||
mockExec.init(mock(Context.class));
|
||||
assertEquals(Arrays.asList("--checksetup"), readMockParams());
|
||||
|
||||
}
|
||||
|
|
|
@ -40,7 +40,7 @@ public class TestNodeManager {
|
|||
public static final class InvalidContainerExecutor extends
|
||||
DefaultContainerExecutor {
|
||||
@Override
|
||||
public void init() throws IOException {
|
||||
public void init(Context nmContext) throws IOException {
|
||||
throw new IOException("dummy executor init called");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,16 +20,14 @@ package org.apache.hadoop.yarn.server.nodemanager;
|
|||
|
||||
import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
@ -80,8 +78,6 @@ import org.apache.hadoop.yarn.event.Dispatcher;
|
|||
import org.apache.hadoop.yarn.event.EventHandler;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.ResourceTracker;
|
||||
|
@ -117,41 +113,14 @@ import org.junit.Before;
|
|||
import org.junit.Test;
|
||||
|
||||
@SuppressWarnings("rawtypes")
|
||||
public class TestNodeStatusUpdater {
|
||||
|
||||
// temp fix until metrics system can auto-detect itself running in unit test:
|
||||
static {
|
||||
DefaultMetricsSystem.setMiniClusterMode(true);
|
||||
}
|
||||
|
||||
static final Logger LOG =
|
||||
LoggerFactory.getLogger(TestNodeStatusUpdater.class);
|
||||
static final File basedir =
|
||||
new File("target", TestNodeStatusUpdater.class.getName());
|
||||
static final File nmLocalDir = new File(basedir, "nm0");
|
||||
static final File tmpDir = new File(basedir, "tmpDir");
|
||||
static final File remoteLogsDir = new File(basedir, "remotelogs");
|
||||
static final File logsDir = new File(basedir, "logs");
|
||||
private static final RecordFactory recordFactory = RecordFactoryProvider
|
||||
.getRecordFactory(null);
|
||||
|
||||
public class TestNodeStatusUpdater extends NodeManagerTestBase {
|
||||
volatile int heartBeatID = 0;
|
||||
volatile Throwable nmStartError = null;
|
||||
private final List<NodeId> registeredNodes = new ArrayList<NodeId>();
|
||||
private boolean triggered = false;
|
||||
private Configuration conf;
|
||||
private NodeManager nm;
|
||||
private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
|
||||
|
||||
@Before
|
||||
public void setUp() throws IOException {
|
||||
nmLocalDir.mkdirs();
|
||||
tmpDir.mkdirs();
|
||||
logsDir.mkdirs();
|
||||
remoteLogsDir.mkdirs();
|
||||
conf = createNMConfig();
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDown() {
|
||||
this.registeredNodes.clear();
|
||||
|
@ -332,29 +301,7 @@ public class TestNodeStatusUpdater {
|
|||
}
|
||||
}
|
||||
|
||||
private class MyContainerManager extends ContainerManagerImpl {
|
||||
public boolean signaled = false;
|
||||
|
||||
public MyContainerManager(Context context, ContainerExecutor exec,
|
||||
DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
|
||||
NodeManagerMetrics metrics,
|
||||
LocalDirsHandlerService dirsHandler) {
|
||||
super(context, exec, deletionContext, nodeStatusUpdater,
|
||||
metrics, dirsHandler);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handle(ContainerManagerEvent event) {
|
||||
if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
|
||||
signaled = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl {
|
||||
public ResourceTracker resourceTracker;
|
||||
private Context context;
|
||||
|
||||
private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest {
|
||||
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
|
||||
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
|
||||
this(context, dispatcher, healthChecker, metrics, false);
|
||||
|
@ -363,19 +310,8 @@ public class TestNodeStatusUpdater {
|
|||
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
|
||||
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
|
||||
boolean signalContainer) {
|
||||
super(context, dispatcher, healthChecker, metrics);
|
||||
this.context = context;
|
||||
resourceTracker = new MyResourceTracker(this.context, signalContainer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ResourceTracker getRMClient() {
|
||||
return resourceTracker;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void stopRMProxy() {
|
||||
return;
|
||||
super(context, dispatcher, healthChecker, metrics,
|
||||
new MyResourceTracker(context, signalContainer));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1818,7 +1754,6 @@ public class TestNodeStatusUpdater {
|
|||
Assert.assertTrue("Test failed with exception(s)" + exceptions,
|
||||
exceptions.isEmpty());
|
||||
}
|
||||
|
||||
// Add new containers info into NM context each time node heart beats.
|
||||
private class MyNMContext extends NMContext {
|
||||
|
||||
|
@ -1922,31 +1857,6 @@ public class TestNodeStatusUpdater {
|
|||
this.registeredNodes.size());
|
||||
}
|
||||
|
||||
private YarnConfiguration createNMConfig(int port) throws IOException {
|
||||
YarnConfiguration conf = new YarnConfiguration();
|
||||
String localhostAddress = null;
|
||||
try {
|
||||
localhostAddress = InetAddress.getByName("localhost")
|
||||
.getCanonicalHostName();
|
||||
} catch (UnknownHostException e) {
|
||||
Assert.fail("Unable to get localhost address: " + e.getMessage());
|
||||
}
|
||||
conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
|
||||
conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
|
||||
conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
|
||||
+ ServerSocketUtil.getPort(49160, 10));
|
||||
conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
|
||||
conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
|
||||
remoteLogsDir.getAbsolutePath());
|
||||
conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
|
||||
conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
|
||||
return conf;
|
||||
}
|
||||
|
||||
private YarnConfiguration createNMConfig() throws IOException {
|
||||
return createNMConfig(ServerSocketUtil.getPort(49170, 10));
|
||||
}
|
||||
|
||||
private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
|
||||
return new NodeManager() {
|
||||
@Override
|
||||
|
|
|
@ -18,26 +18,6 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.security.PrivilegedExceptionAction;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ExecutorCompletionService;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.security.Credentials;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
|
@ -66,6 +46,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
|
|||
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
|
||||
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
|
||||
|
@ -74,18 +55,37 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
|
||||
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
|
||||
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
|
||||
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||
import org.apache.hadoop.yarn.util.Records;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.security.PrivilegedExceptionAction;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.ExecutorCompletionService;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* Base class for all the AMRMProxyService test cases. It provides utility
|
||||
|
@ -805,5 +805,9 @@ public abstract class BaseAMRMProxyTest {
|
|||
public NMTimelinePublisher getNMTimelinePublisher() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public ResourcePluginManager getResourcePluginManager() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
|
|||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
@ -30,6 +31,8 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import static org.mockito.Mockito.mock;
|
||||
|
||||
public class TestResourceHandlerModule {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TestResourceHandlerModule.class);
|
||||
|
@ -62,7 +65,7 @@ public class TestResourceHandlerModule {
|
|||
|
||||
//Ensure that outbound bandwidth resource handler is present in the chain
|
||||
ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule
|
||||
.getConfiguredResourceHandlerChain(networkEnabledConf);
|
||||
.getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class));
|
||||
List<ResourceHandler> resourceHandlers = resourceHandlerChain
|
||||
.getResourceHandlerList();
|
||||
//Exactly one resource handler in chain
|
||||
|
@ -88,7 +91,8 @@ public class TestResourceHandlerModule {
|
|||
Assert.assertNotNull(handler);
|
||||
|
||||
ResourceHandlerChain resourceHandlerChain =
|
||||
ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf);
|
||||
ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf,
|
||||
mock(Context.class));
|
||||
List<ResourceHandler> resourceHandlers =
|
||||
resourceHandlerChain.getResourceHandlerList();
|
||||
// Exactly one resource handler in chain
|
||||
|
|
|
@ -0,0 +1,385 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||
import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import static org.mockito.Matchers.any;
|
||||
import static org.mockito.Matchers.anyList;
|
||||
import static org.mockito.Matchers.anyListOf;
|
||||
import static org.mockito.Matchers.anyString;
|
||||
import static org.mockito.Matchers.eq;
|
||||
import static org.mockito.Mockito.doThrow;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.never;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
public class TestGpuResourceHandler {
|
||||
private CGroupsHandler mockCGroupsHandler;
|
||||
private PrivilegedOperationExecutor mockPrivilegedExecutor;
|
||||
private GpuResourceHandlerImpl gpuResourceHandler;
|
||||
private NMStateStoreService mockNMStateStore;
|
||||
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
|
||||
|
||||
mockCGroupsHandler = mock(CGroupsHandler.class);
|
||||
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
|
||||
mockNMStateStore = mock(NMStateStoreService.class);
|
||||
|
||||
Context nmctx = mock(Context.class);
|
||||
when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
|
||||
runningContainersMap = new ConcurrentHashMap<>();
|
||||
when(nmctx.getContainers()).thenReturn(runningContainersMap);
|
||||
|
||||
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
|
||||
mockPrivilegedExecutor);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBootStrap() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
|
||||
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
|
||||
CGroupsHandler.CGroupController.DEVICES);
|
||||
}
|
||||
|
||||
private static ContainerId getContainerId(int id) {
|
||||
return ContainerId.newContainerId(ApplicationAttemptId
|
||||
.newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
|
||||
}
|
||||
|
||||
private static Container mockContainerWithGpuRequest(int id,
|
||||
int numGpuRequest) {
|
||||
Container c = mock(Container.class);
|
||||
when(c.getContainerId()).thenReturn(getContainerId(id));
|
||||
|
||||
Resource res = Resource.newInstance(1024, 1);
|
||||
ResourceMappings resMapping = new ResourceMappings();
|
||||
|
||||
res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
|
||||
when(c.getResource()).thenReturn(res);
|
||||
when(c.getResourceMappings()).thenReturn(resMapping);
|
||||
return c;
|
||||
}
|
||||
|
||||
private void verifyDeniedDevices(ContainerId containerId,
|
||||
List<Integer> deniedDevices)
|
||||
throws ResourceHandlerException, PrivilegedOperationException {
|
||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
||||
CGroupsHandler.CGroupController.DEVICES, containerId.toString());
|
||||
|
||||
if (null != deniedDevices && !deniedDevices.isEmpty()) {
|
||||
verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
|
||||
new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
|
||||
.asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
|
||||
containerId.toString(),
|
||||
GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
|
||||
StringUtils.join(",", deniedDevices))), true);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllocation() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
|
||||
/* Start container 1, asks 3 containers */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
|
||||
|
||||
// Only device=4 will be blocked.
|
||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
|
||||
|
||||
/* Start container 2, asks 2 containers. Excepted to fail */
|
||||
boolean failedToAllocate = false;
|
||||
try {
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
|
||||
} catch (ResourceHandlerException e) {
|
||||
failedToAllocate = true;
|
||||
}
|
||||
Assert.assertTrue(failedToAllocate);
|
||||
|
||||
/* Start container 3, ask 1 container, succeeded */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
|
||||
|
||||
// devices = 0/1/3 will be blocked
|
||||
verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
|
||||
|
||||
/* Start container 4, ask 0 container, succeeded */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
|
||||
|
||||
// All devices will be blocked
|
||||
verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
|
||||
|
||||
/* Release container-1, expect cgroups deleted */
|
||||
gpuResourceHandler.postComplete(getContainerId(1));
|
||||
|
||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
||||
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
|
||||
Assert.assertEquals(3,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
|
||||
/* Release container-3, expect cgroups deleted */
|
||||
gpuResourceHandler.postComplete(getContainerId(3));
|
||||
|
||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
||||
CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
|
||||
Assert.assertEquals(4,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
||||
throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
|
||||
doThrow(new IOException("Exception ...")).when(mockNMStateStore)
|
||||
.storeAssignedResources(
|
||||
any(ContainerId.class), anyString(), anyList());
|
||||
|
||||
boolean exception = false;
|
||||
/* Start container 1, asks 3 containers */
|
||||
try {
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
|
||||
} catch (ResourceHandlerException e) {
|
||||
exception = true;
|
||||
}
|
||||
|
||||
Assert.assertTrue("preStart should throw exception", exception);
|
||||
|
||||
// After preStart, we still have 4 available GPU since the store op fails.
|
||||
Assert.assertEquals(4,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllocationWithoutAllowedGpus() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(0,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
|
||||
/* Start container 1, asks 0 containers */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
|
||||
verifyDeniedDevices(getContainerId(1), Collections.<Integer>emptyList());
|
||||
|
||||
/* Start container 2, asks 1 containers. Excepted to fail */
|
||||
boolean failedToAllocate = false;
|
||||
try {
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
|
||||
} catch (ResourceHandlerException e) {
|
||||
failedToAllocate = true;
|
||||
}
|
||||
Assert.assertTrue(failedToAllocate);
|
||||
|
||||
/* Release container 1, expect cgroups deleted */
|
||||
gpuResourceHandler.postComplete(getContainerId(1));
|
||||
|
||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
||||
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
|
||||
Assert.assertEquals(0,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllocationStored() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
|
||||
/* Start container 1, asks 3 containers */
|
||||
Container container = mockContainerWithGpuRequest(1, 3);
|
||||
gpuResourceHandler.preStart(container);
|
||||
|
||||
verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
|
||||
ResourceInformation.GPU_URI,
|
||||
Arrays.<Serializable>asList("0", "1", "3"));
|
||||
|
||||
Assert.assertEquals(3, container.getResourceMappings()
|
||||
.getAssignedResources(ResourceInformation.GPU_URI).size());
|
||||
|
||||
// Only device=4 will be blocked.
|
||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
|
||||
|
||||
/* Start container 2, ask 0 container, succeeded */
|
||||
container = mockContainerWithGpuRequest(2, 0);
|
||||
gpuResourceHandler.preStart(container);
|
||||
|
||||
verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
|
||||
Assert.assertEquals(0, container.getResourceMappings()
|
||||
.getAssignedResources(ResourceInformation.GPU_URI).size());
|
||||
|
||||
// Store assigned resource will not be invoked.
|
||||
verify(mockNMStateStore, never()).storeAssignedResources(
|
||||
eq(getContainerId(2)), eq(ResourceInformation.GPU_URI),
|
||||
anyListOf(Serializable.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRecoverResourceAllocation() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(4,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
|
||||
Container nmContainer = mock(Container.class);
|
||||
ResourceMappings rmap = new ResourceMappings();
|
||||
ResourceMappings.AssignedResources ar =
|
||||
new ResourceMappings.AssignedResources();
|
||||
ar.updateAssignedResources(Arrays.<Serializable>asList("1", "3"));
|
||||
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
|
||||
when(nmContainer.getResourceMappings()).thenReturn(rmap);
|
||||
|
||||
runningContainersMap.put(getContainerId(1), nmContainer);
|
||||
|
||||
// TEST CASE
|
||||
// Reacquire container restore state of GPU Resource Allocator.
|
||||
gpuResourceHandler.reacquireContainer(getContainerId(1));
|
||||
|
||||
Map<Integer, ContainerId> deviceAllocationMapping =
|
||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
|
||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
||||
Assert.assertTrue(
|
||||
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
|
||||
|
||||
// TEST CASE
|
||||
// Try to reacquire a container but requested device is not in allowed list.
|
||||
nmContainer = mock(Container.class);
|
||||
rmap = new ResourceMappings();
|
||||
ar = new ResourceMappings.AssignedResources();
|
||||
// id=5 is not in allowed list.
|
||||
ar.updateAssignedResources(Arrays.<Serializable>asList("4", "5"));
|
||||
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
|
||||
when(nmContainer.getResourceMappings()).thenReturn(rmap);
|
||||
|
||||
runningContainersMap.put(getContainerId(2), nmContainer);
|
||||
|
||||
boolean caughtException = false;
|
||||
try {
|
||||
gpuResourceHandler.reacquireContainer(getContainerId(1));
|
||||
} catch (ResourceHandlerException e) {
|
||||
caughtException = true;
|
||||
}
|
||||
Assert.assertTrue(
|
||||
"Should fail since requested device Id is not in allowed list",
|
||||
caughtException);
|
||||
|
||||
// Make sure internal state not changed.
|
||||
deviceAllocationMapping =
|
||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
|
||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
||||
Assert.assertTrue(
|
||||
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
|
||||
|
||||
// TEST CASE
|
||||
// Try to reacquire a container but requested device is already assigned.
|
||||
nmContainer = mock(Container.class);
|
||||
rmap = new ResourceMappings();
|
||||
ar = new ResourceMappings.AssignedResources();
|
||||
// id=3 is already assigned
|
||||
ar.updateAssignedResources(Arrays.<Serializable>asList("4", "3"));
|
||||
rmap.addAssignedResources("gpu", ar);
|
||||
when(nmContainer.getResourceMappings()).thenReturn(rmap);
|
||||
|
||||
runningContainersMap.put(getContainerId(2), nmContainer);
|
||||
|
||||
caughtException = false;
|
||||
try {
|
||||
gpuResourceHandler.reacquireContainer(getContainerId(1));
|
||||
} catch (ResourceHandlerException e) {
|
||||
caughtException = true;
|
||||
}
|
||||
Assert.assertTrue(
|
||||
"Should fail since requested device Id is not in allowed list",
|
||||
caughtException);
|
||||
|
||||
// Make sure internal state not changed.
|
||||
deviceAllocationMapping =
|
||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
|
||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
||||
Assert.assertTrue(
|
||||
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
|
||||
}
|
||||
}
|
|
@ -70,7 +70,7 @@ public class TestContainersMonitorResourceChange {
|
|||
|
||||
private static class MockExecutor extends ContainerExecutor {
|
||||
@Override
|
||||
public void init() throws IOException {
|
||||
public void init(Context nmContext) throws IOException {
|
||||
}
|
||||
@Override
|
||||
public void startLocalizer(LocalizerStartContext ctx)
|
||||
|
|
|
@ -0,0 +1,261 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.service.ServiceOperations;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
|
||||
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||
import org.junit.After;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.mockito.Matchers.any;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
public class TestResourcePluginManager extends NodeManagerTestBase {
|
||||
private NodeManager nm;
|
||||
|
||||
ResourcePluginManager stubResourcePluginmanager() {
|
||||
// Stub ResourcePluginManager
|
||||
final ResourcePluginManager rpm = mock(ResourcePluginManager.class);
|
||||
Map<String, ResourcePlugin> plugins = new HashMap<>();
|
||||
|
||||
// First resource plugin
|
||||
ResourcePlugin resourcePlugin = mock(ResourcePlugin.class);
|
||||
NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = mock(
|
||||
NodeResourceUpdaterPlugin.class);
|
||||
when(resourcePlugin.getNodeResourceHandlerInstance()).thenReturn(
|
||||
nodeResourceUpdaterPlugin);
|
||||
plugins.put("resource1", resourcePlugin);
|
||||
|
||||
// Second resource plugin
|
||||
resourcePlugin = mock(ResourcePlugin.class);
|
||||
when(resourcePlugin.createResourceHandler(any(Context.class), any(
|
||||
CGroupsHandler.class), any(PrivilegedOperationExecutor.class)))
|
||||
.thenReturn(new CustomizedResourceHandler());
|
||||
plugins.put("resource2", resourcePlugin);
|
||||
when(rpm.getNameToPlugins()).thenReturn(plugins);
|
||||
return rpm;
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDown() {
|
||||
if (nm != null) {
|
||||
try {
|
||||
ServiceOperations.stop(nm);
|
||||
} catch (Throwable t) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class CustomizedResourceHandler implements ResourceHandler {
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> bootstrap(Configuration configuration)
|
||||
throws ResourceHandlerException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> preStart(Container container)
|
||||
throws ResourceHandlerException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
|
||||
throws ResourceHandlerException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> postComplete(ContainerId containerId)
|
||||
throws ResourceHandlerException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PrivilegedOperation> teardown()
|
||||
throws ResourceHandlerException {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private class MyMockNM extends NodeManager {
|
||||
private final ResourcePluginManager rpm;
|
||||
|
||||
public MyMockNM(ResourcePluginManager rpm) {
|
||||
this.rpm = rpm;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
|
||||
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
|
||||
((NodeManager.NMContext)context).setResourcePluginManager(rpm);
|
||||
return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
|
||||
metrics, new BaseResourceTrackerForTest());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ContainerManagerImpl createContainerManager(Context context,
|
||||
ContainerExecutor exec, DeletionService del,
|
||||
NodeStatusUpdater nodeStatusUpdater,
|
||||
ApplicationACLsManager aclsManager,
|
||||
LocalDirsHandlerService diskhandler) {
|
||||
return new MyContainerManager(context, exec, del, nodeStatusUpdater,
|
||||
metrics, diskhandler);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ResourcePluginManager createResourcePluginManager() {
|
||||
return rpm;
|
||||
}
|
||||
}
|
||||
|
||||
public class MyLCE extends LinuxContainerExecutor {
|
||||
private PrivilegedOperationExecutor poe = mock(PrivilegedOperationExecutor.class);
|
||||
|
||||
@Override
|
||||
protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
|
||||
return poe;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure ResourcePluginManager is initialized during NM start up.
|
||||
*/
|
||||
@Test(timeout = 30000)
|
||||
public void testResourcePluginManagerInitialization() throws Exception {
|
||||
final ResourcePluginManager rpm = stubResourcePluginmanager();
|
||||
nm = new MyMockNM(rpm);
|
||||
|
||||
YarnConfiguration conf = createNMConfig();
|
||||
nm.init(conf);
|
||||
verify(rpm, times(1)).initialize(
|
||||
any(Context.class));
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure ResourcePluginManager is invoked during NM update.
|
||||
*/
|
||||
@Test(timeout = 30000)
|
||||
public void testNodeStatusUpdaterWithResourcePluginsEnabled() throws Exception {
|
||||
final ResourcePluginManager rpm = stubResourcePluginmanager();
|
||||
|
||||
nm = new MyMockNM(rpm);
|
||||
|
||||
YarnConfiguration conf = createNMConfig();
|
||||
nm.init(conf);
|
||||
nm.start();
|
||||
|
||||
NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin =
|
||||
rpm.getNameToPlugins().get("resource1")
|
||||
.getNodeResourceHandlerInstance();
|
||||
|
||||
verify(nodeResourceUpdaterPlugin, times(1)).updateConfiguredResource(
|
||||
any(Resource.class));
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure ResourcePluginManager is used to initialize ResourceHandlerChain
|
||||
*/
|
||||
@Test(timeout = 30000)
|
||||
public void testLinuxContainerExecutorWithResourcePluginsEnabled() throws Exception {
|
||||
final ResourcePluginManager rpm = stubResourcePluginmanager();
|
||||
final LinuxContainerExecutor lce = new MyLCE();
|
||||
|
||||
nm = new NodeManager() {
|
||||
@Override
|
||||
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
|
||||
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
|
||||
((NMContext)context).setResourcePluginManager(rpm);
|
||||
return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
|
||||
metrics, new BaseResourceTrackerForTest());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ContainerManagerImpl createContainerManager(Context context,
|
||||
ContainerExecutor exec, DeletionService del,
|
||||
NodeStatusUpdater nodeStatusUpdater,
|
||||
ApplicationACLsManager aclsManager,
|
||||
LocalDirsHandlerService diskhandler) {
|
||||
return new MyContainerManager(context, exec, del, nodeStatusUpdater,
|
||||
metrics, diskhandler);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ContainerExecutor createContainerExecutor(Configuration conf) {
|
||||
((NMContext)this.getNMContext()).setResourcePluginManager(rpm);
|
||||
lce.setConf(conf);
|
||||
return lce;
|
||||
}
|
||||
};
|
||||
|
||||
YarnConfiguration conf = createNMConfig();
|
||||
|
||||
nm.init(conf);
|
||||
nm.start();
|
||||
|
||||
ResourceHandler handler = lce.getResourceHandler();
|
||||
Assert.assertNotNull(handler);
|
||||
Assert.assertTrue(handler instanceof ResourceHandlerChain);
|
||||
|
||||
boolean newHandlerAdded = false;
|
||||
for (ResourceHandler h : ((ResourceHandlerChain) handler)
|
||||
.getResourceHandlerList()) {
|
||||
if (h instanceof CustomizedResourceHandler) {
|
||||
newHandlerAdded = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Assert.assertTrue("New ResourceHandler should be added", newHandlerAdded);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Assume;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class TestGpuDiscoverer {
|
||||
private String getTestParentFolder() {
|
||||
File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
|
||||
return f.getAbsolutePath();
|
||||
}
|
||||
|
||||
private void touchFile(File f) throws IOException {
|
||||
new FileOutputStream(f).close();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void before() throws IOException {
|
||||
String folder = getTestParentFolder();
|
||||
File f = new File(folder);
|
||||
FileUtils.deleteDirectory(f);
|
||||
f.mkdirs();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
|
||||
// Only run this on demand.
|
||||
Assume.assumeTrue(Boolean.valueOf(
|
||||
System.getProperty("RunLinuxGpuResourceDiscoverPluginConfigTest")));
|
||||
|
||||
// test case 1, check default setting.
|
||||
Configuration conf = new Configuration(false);
|
||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||
plugin.initialize(conf);
|
||||
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
|
||||
plugin.getPathOfGpuBinary());
|
||||
Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
|
||||
Assert.assertTrue(
|
||||
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
|
||||
|
||||
// test case 2, check mandatory set path.
|
||||
File fakeBinary = new File(getTestParentFolder(),
|
||||
GpuDiscoverer.DEFAULT_BINARY_NAME);
|
||||
touchFile(fakeBinary);
|
||||
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
|
||||
plugin = new GpuDiscoverer();
|
||||
plugin.initialize(conf);
|
||||
Assert.assertEquals(fakeBinary.getAbsolutePath(),
|
||||
plugin.getPathOfGpuBinary());
|
||||
Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
|
||||
|
||||
// test case 3, check mandatory set path, but binary doesn't exist so default
|
||||
// path will be used.
|
||||
fakeBinary.delete();
|
||||
plugin = new GpuDiscoverer();
|
||||
plugin.initialize(conf);
|
||||
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
|
||||
plugin.getPathOfGpuBinary());
|
||||
Assert.assertTrue(
|
||||
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGpuDiscover() throws YarnException {
|
||||
// Since this is more of a performance unit test, only run if
|
||||
// RunUserLimitThroughput is set (-DRunUserLimitThroughput=true)
|
||||
Assume.assumeTrue(
|
||||
Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
|
||||
Configuration conf = new Configuration(false);
|
||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||
plugin.initialize(conf);
|
||||
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
|
||||
|
||||
Assert.assertTrue(info.getGpus().size() > 0);
|
||||
Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
|
||||
info.getGpus().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void getNumberOfUsableGpusFromConfig() throws YarnException {
|
||||
Configuration conf = new Configuration(false);
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
|
||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||
plugin.initialize(conf);
|
||||
|
||||
List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
|
||||
Assert.assertEquals(4, minorNumbers.size());
|
||||
|
||||
Assert.assertTrue(0 == minorNumbers.get(0));
|
||||
Assert.assertTrue(1 == minorNumbers.get(1));
|
||||
Assert.assertTrue(2 == minorNumbers.get(2));
|
||||
Assert.assertTrue(4 == minorNumbers.get(3));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
public class TestGpuDeviceInformationParser {
|
||||
@Test
|
||||
public void testParse() throws IOException, YarnException {
|
||||
File f = new File("src/test/resources/nvidia-smi-sample-xml-output");
|
||||
String s = FileUtils.readFileToString(f, "UTF-8");
|
||||
|
||||
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
|
||||
|
||||
GpuDeviceInformation info = parser.parseXml(s);
|
||||
Assert.assertEquals("375.66", info.getDriverVersion());
|
||||
Assert.assertEquals(2, info.getGpus().size());
|
||||
PerGpuDeviceInformation gpu1 = info.getGpus().get(1);
|
||||
Assert.assertEquals("Tesla P100-PCIE-12GB", gpu1.getProductName());
|
||||
Assert.assertEquals(16384, gpu1.getGpuMemoryUsage().getTotalMemoryMiB());
|
||||
Assert.assertEquals(10.3f,
|
||||
gpu1.getGpuUtilizations().getOverallGpuUtilization(), 1e-6);
|
||||
Assert.assertEquals(34f, gpu1.getTemperature().getCurrentGpuTemp(), 1e-6);
|
||||
Assert.assertEquals(85f, gpu1.getTemperature().getMaxGpuTemp(), 1e-6);
|
||||
Assert.assertEquals(82f, gpu1.getTemperature().getSlowThresholdGpuTemp(),
|
||||
1e-6);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,547 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd">
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<nvidia_smi_log>
|
||||
<timestamp>Wed Sep 6 21:52:51 2017</timestamp>
|
||||
<driver_version>375.66</driver_version>
|
||||
<attached_gpus>2</attached_gpus>
|
||||
<gpu id="0000:04:00.0">
|
||||
<product_name>Tesla P100-PCIE-12GB</product_name>
|
||||
<product_brand>Tesla</product_brand>
|
||||
<display_mode>Disabled</display_mode>
|
||||
<display_active>Disabled</display_active>
|
||||
<persistence_mode>Disabled</persistence_mode>
|
||||
<accounting_mode>Disabled</accounting_mode>
|
||||
<accounting_mode_buffer_size>1920</accounting_mode_buffer_size>
|
||||
<driver_model>
|
||||
<current_dm>N/A</current_dm>
|
||||
<pending_dm>N/A</pending_dm>
|
||||
</driver_model>
|
||||
<serial>0320717030197</serial>
|
||||
<uuid>GPU-28604e81-21ec-cc48-6759-bf2648b22e16</uuid>
|
||||
<minor_number>0</minor_number>
|
||||
<vbios_version>86.00.3A.00.02</vbios_version>
|
||||
<multigpu_board>No</multigpu_board>
|
||||
<board_id>0x400</board_id>
|
||||
<gpu_part_number>900-2H400-0110-030</gpu_part_number>
|
||||
<inforom_version>
|
||||
<img_version>H400.0202.00.01</img_version>
|
||||
<oem_object>1.1</oem_object>
|
||||
<ecc_object>4.1</ecc_object>
|
||||
<pwr_object>N/A</pwr_object>
|
||||
</inforom_version>
|
||||
<gpu_operation_mode>
|
||||
<current_gom>N/A</current_gom>
|
||||
<pending_gom>N/A</pending_gom>
|
||||
</gpu_operation_mode>
|
||||
<gpu_virtualization_mode>
|
||||
<virtualization_mode>None</virtualization_mode>
|
||||
</gpu_virtualization_mode>
|
||||
<pci>
|
||||
<pci_bus>04</pci_bus>
|
||||
<pci_device>00</pci_device>
|
||||
<pci_domain>0000</pci_domain>
|
||||
<pci_device_id>15F710DE</pci_device_id>
|
||||
<pci_bus_id>0000:04:00.0</pci_bus_id>
|
||||
<pci_sub_system_id>11DA10DE</pci_sub_system_id>
|
||||
<pci_gpu_link_info>
|
||||
<pcie_gen>
|
||||
<max_link_gen>3</max_link_gen>
|
||||
<current_link_gen>3</current_link_gen>
|
||||
</pcie_gen>
|
||||
<link_widths>
|
||||
<max_link_width>16x</max_link_width>
|
||||
<current_link_width>16x</current_link_width>
|
||||
</link_widths>
|
||||
</pci_gpu_link_info>
|
||||
<pci_bridge_chip>
|
||||
<bridge_chip_type>N/A</bridge_chip_type>
|
||||
<bridge_chip_fw>N/A</bridge_chip_fw>
|
||||
</pci_bridge_chip>
|
||||
<replay_counter>0</replay_counter>
|
||||
<tx_util>0 KB/s</tx_util>
|
||||
<rx_util>0 KB/s</rx_util>
|
||||
</pci>
|
||||
<fan_speed>N/A</fan_speed>
|
||||
<performance_state>P0</performance_state>
|
||||
<clocks_throttle_reasons>
|
||||
<clocks_throttle_reason_gpu_idle>Active</clocks_throttle_reason_gpu_idle>
|
||||
<clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
|
||||
<clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
|
||||
<clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
|
||||
<clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
|
||||
<clocks_throttle_reason_unknown>Not Active</clocks_throttle_reason_unknown>
|
||||
</clocks_throttle_reasons>
|
||||
<fb_memory_usage>
|
||||
<total>12193 MiB</total>
|
||||
<used>0 MiB</used>
|
||||
<free>12193 MiB</free>
|
||||
</fb_memory_usage>
|
||||
<bar1_memory_usage>
|
||||
<total>16384 MiB</total>
|
||||
<used>2 MiB</used>
|
||||
<free>16382 MiB</free>
|
||||
</bar1_memory_usage>
|
||||
<compute_mode>Default</compute_mode>
|
||||
<utilization>
|
||||
<gpu_util>0 %</gpu_util>
|
||||
<memory_util>0 %</memory_util>
|
||||
<encoder_util>0 %</encoder_util>
|
||||
<decoder_util>0 %</decoder_util>
|
||||
</utilization>
|
||||
<encoder_stats>
|
||||
<session_count>0</session_count>
|
||||
<average_fps>0</average_fps>
|
||||
<average_latency>0 ms</average_latency>
|
||||
</encoder_stats>
|
||||
<ecc_mode>
|
||||
<current_ecc>Enabled</current_ecc>
|
||||
<pending_ecc>Enabled</pending_ecc>
|
||||
</ecc_mode>
|
||||
<ecc_errors>
|
||||
<volatile>
|
||||
<single_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</single_bit>
|
||||
<double_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</double_bit>
|
||||
</volatile>
|
||||
<aggregate>
|
||||
<single_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</single_bit>
|
||||
<double_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</double_bit>
|
||||
</aggregate>
|
||||
</ecc_errors>
|
||||
<retired_pages>
|
||||
<multiple_single_bit_retirement>
|
||||
<retired_count>0</retired_count>
|
||||
<retired_page_addresses>
|
||||
</retired_page_addresses>
|
||||
</multiple_single_bit_retirement>
|
||||
<double_bit_retirement>
|
||||
<retired_count>0</retired_count>
|
||||
<retired_page_addresses>
|
||||
</retired_page_addresses>
|
||||
</double_bit_retirement>
|
||||
<pending_retirement>No</pending_retirement>
|
||||
</retired_pages>
|
||||
<temperature>
|
||||
<gpu_temp>31 C</gpu_temp>
|
||||
<gpu_temp_max_threshold>85 C</gpu_temp_max_threshold>
|
||||
<gpu_temp_slow_threshold>82 C</gpu_temp_slow_threshold>
|
||||
</temperature>
|
||||
<power_readings>
|
||||
<power_state>P0</power_state>
|
||||
<power_management>Supported</power_management>
|
||||
<power_draw>24.84 W</power_draw>
|
||||
<power_limit>250.00 W</power_limit>
|
||||
<default_power_limit>250.00 W</default_power_limit>
|
||||
<enforced_power_limit>250.00 W</enforced_power_limit>
|
||||
<min_power_limit>125.00 W</min_power_limit>
|
||||
<max_power_limit>250.00 W</max_power_limit>
|
||||
</power_readings>
|
||||
<clocks>
|
||||
<graphics_clock>405 MHz</graphics_clock>
|
||||
<sm_clock>405 MHz</sm_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
<video_clock>835 MHz</video_clock>
|
||||
</clocks>
|
||||
<applications_clocks>
|
||||
<graphics_clock>1189 MHz</graphics_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
</applications_clocks>
|
||||
<default_applications_clocks>
|
||||
<graphics_clock>1189 MHz</graphics_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
</default_applications_clocks>
|
||||
<max_clocks>
|
||||
<graphics_clock>1328 MHz</graphics_clock>
|
||||
<sm_clock>1328 MHz</sm_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
<video_clock>1328 MHz</video_clock>
|
||||
</max_clocks>
|
||||
<clock_policy>
|
||||
<auto_boost>N/A</auto_boost>
|
||||
<auto_boost_default>N/A</auto_boost_default>
|
||||
</clock_policy>
|
||||
<supported_clocks>
|
||||
<supported_mem_clock>
|
||||
<value>715 MHz</value>
|
||||
<supported_graphics_clock>1328 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1316 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1303 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1278 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1265 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1252 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1240 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1227 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1202 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1189 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1177 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1164 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1151 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1139 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1126 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1113 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1101 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1088 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1075 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1063 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1037 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1025 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1012 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>999 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>987 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>974 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>961 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>949 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>936 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>923 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>911 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>898 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>885 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>873 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>860 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>847 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>835 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>822 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>810 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>797 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>784 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>772 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>759 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>746 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>734 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>721 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>708 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>696 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>683 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>670 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>658 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>645 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>632 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>620 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>607 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>594 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>582 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>569 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>556 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>544 MHz</supported_graphics_clock>
|
||||
</supported_mem_clock>
|
||||
</supported_clocks>
|
||||
<processes>
|
||||
</processes>
|
||||
<accounted_processes>
|
||||
</accounted_processes>
|
||||
</gpu>
|
||||
|
||||
<gpu id="0000:82:00.0">
|
||||
<product_name>Tesla P100-PCIE-12GB</product_name>
|
||||
<product_brand>Tesla</product_brand>
|
||||
<display_mode>Disabled</display_mode>
|
||||
<display_active>Disabled</display_active>
|
||||
<persistence_mode>Disabled</persistence_mode>
|
||||
<accounting_mode>Disabled</accounting_mode>
|
||||
<accounting_mode_buffer_size>1920</accounting_mode_buffer_size>
|
||||
<driver_model>
|
||||
<current_dm>N/A</current_dm>
|
||||
<pending_dm>N/A</pending_dm>
|
||||
</driver_model>
|
||||
<serial>0320717031755</serial>
|
||||
<uuid>GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3</uuid>
|
||||
<minor_number>1</minor_number>
|
||||
<vbios_version>86.00.3A.00.02</vbios_version>
|
||||
<multigpu_board>No</multigpu_board>
|
||||
<board_id>0x8200</board_id>
|
||||
<gpu_part_number>900-2H400-0110-030</gpu_part_number>
|
||||
<inforom_version>
|
||||
<img_version>H400.0202.00.01</img_version>
|
||||
<oem_object>1.1</oem_object>
|
||||
<ecc_object>4.1</ecc_object>
|
||||
<pwr_object>N/A</pwr_object>
|
||||
</inforom_version>
|
||||
<gpu_operation_mode>
|
||||
<current_gom>N/A</current_gom>
|
||||
<pending_gom>N/A</pending_gom>
|
||||
</gpu_operation_mode>
|
||||
<gpu_virtualization_mode>
|
||||
<virtualization_mode>None</virtualization_mode>
|
||||
</gpu_virtualization_mode>
|
||||
<pci>
|
||||
<pci_bus>82</pci_bus>
|
||||
<pci_device>00</pci_device>
|
||||
<pci_domain>0000</pci_domain>
|
||||
<pci_device_id>15F710DE</pci_device_id>
|
||||
<pci_bus_id>0000:82:00.0</pci_bus_id>
|
||||
<pci_sub_system_id>11DA10DE</pci_sub_system_id>
|
||||
<pci_gpu_link_info>
|
||||
<pcie_gen>
|
||||
<max_link_gen>3</max_link_gen>
|
||||
<current_link_gen>3</current_link_gen>
|
||||
</pcie_gen>
|
||||
<link_widths>
|
||||
<max_link_width>16x</max_link_width>
|
||||
<current_link_width>16x</current_link_width>
|
||||
</link_widths>
|
||||
</pci_gpu_link_info>
|
||||
<pci_bridge_chip>
|
||||
<bridge_chip_type>N/A</bridge_chip_type>
|
||||
<bridge_chip_fw>N/A</bridge_chip_fw>
|
||||
</pci_bridge_chip>
|
||||
<replay_counter>0</replay_counter>
|
||||
<tx_util>0 KB/s</tx_util>
|
||||
<rx_util>0 KB/s</rx_util>
|
||||
</pci>
|
||||
<fan_speed>N/A</fan_speed>
|
||||
<performance_state>P0</performance_state>
|
||||
<clocks_throttle_reasons>
|
||||
<clocks_throttle_reason_gpu_idle>Active</clocks_throttle_reason_gpu_idle>
|
||||
<clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
|
||||
<clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
|
||||
<clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
|
||||
<clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
|
||||
<clocks_throttle_reason_unknown>Not Active</clocks_throttle_reason_unknown>
|
||||
</clocks_throttle_reasons>
|
||||
<fb_memory_usage>
|
||||
<total>12193 MiB</total>
|
||||
<used>0 MiB</used>
|
||||
<free>12193 MiB</free>
|
||||
</fb_memory_usage>
|
||||
<bar1_memory_usage>
|
||||
<total>16384 MiB</total>
|
||||
<used>2 MiB</used>
|
||||
<free>16382 MiB</free>
|
||||
</bar1_memory_usage>
|
||||
<compute_mode>Default</compute_mode>
|
||||
<utilization>
|
||||
<gpu_util>10.3 %</gpu_util>
|
||||
<memory_util>0 %</memory_util>
|
||||
<encoder_util>0 %</encoder_util>
|
||||
<decoder_util>0 %</decoder_util>
|
||||
</utilization>
|
||||
<encoder_stats>
|
||||
<session_count>0</session_count>
|
||||
<average_fps>0</average_fps>
|
||||
<average_latency>0 ms</average_latency>
|
||||
</encoder_stats>
|
||||
<ecc_mode>
|
||||
<current_ecc>Enabled</current_ecc>
|
||||
<pending_ecc>Enabled</pending_ecc>
|
||||
</ecc_mode>
|
||||
<ecc_errors>
|
||||
<volatile>
|
||||
<single_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</single_bit>
|
||||
<double_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</double_bit>
|
||||
</volatile>
|
||||
<aggregate>
|
||||
<single_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</single_bit>
|
||||
<double_bit>
|
||||
<device_memory>0</device_memory>
|
||||
<register_file>0</register_file>
|
||||
<l1_cache>N/A</l1_cache>
|
||||
<l2_cache>0</l2_cache>
|
||||
<texture_memory>0</texture_memory>
|
||||
<texture_shm>0</texture_shm>
|
||||
<total>0</total>
|
||||
</double_bit>
|
||||
</aggregate>
|
||||
</ecc_errors>
|
||||
<retired_pages>
|
||||
<multiple_single_bit_retirement>
|
||||
<retired_count>0</retired_count>
|
||||
<retired_page_addresses>
|
||||
</retired_page_addresses>
|
||||
</multiple_single_bit_retirement>
|
||||
<double_bit_retirement>
|
||||
<retired_count>0</retired_count>
|
||||
<retired_page_addresses>
|
||||
</retired_page_addresses>
|
||||
</double_bit_retirement>
|
||||
<pending_retirement>No</pending_retirement>
|
||||
</retired_pages>
|
||||
<temperature>
|
||||
<gpu_temp>34 C</gpu_temp>
|
||||
<gpu_temp_max_threshold>85 C</gpu_temp_max_threshold>
|
||||
<gpu_temp_slow_threshold>82 C</gpu_temp_slow_threshold>
|
||||
</temperature>
|
||||
<power_readings>
|
||||
<power_state>P0</power_state>
|
||||
<power_management>Supported</power_management>
|
||||
<power_draw>25.54 W</power_draw>
|
||||
<power_limit>250.00 W</power_limit>
|
||||
<default_power_limit>250.00 W</default_power_limit>
|
||||
<enforced_power_limit>250.00 W</enforced_power_limit>
|
||||
<min_power_limit>125.00 W</min_power_limit>
|
||||
<max_power_limit>250.00 W</max_power_limit>
|
||||
</power_readings>
|
||||
<clocks>
|
||||
<graphics_clock>405 MHz</graphics_clock>
|
||||
<sm_clock>405 MHz</sm_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
<video_clock>835 MHz</video_clock>
|
||||
</clocks>
|
||||
<applications_clocks>
|
||||
<graphics_clock>1189 MHz</graphics_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
</applications_clocks>
|
||||
<default_applications_clocks>
|
||||
<graphics_clock>1189 MHz</graphics_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
</default_applications_clocks>
|
||||
<max_clocks>
|
||||
<graphics_clock>1328 MHz</graphics_clock>
|
||||
<sm_clock>1328 MHz</sm_clock>
|
||||
<mem_clock>715 MHz</mem_clock>
|
||||
<video_clock>1328 MHz</video_clock>
|
||||
</max_clocks>
|
||||
<clock_policy>
|
||||
<auto_boost>N/A</auto_boost>
|
||||
<auto_boost_default>N/A</auto_boost_default>
|
||||
</clock_policy>
|
||||
<supported_clocks>
|
||||
<supported_mem_clock>
|
||||
<value>715 MHz</value>
|
||||
<supported_graphics_clock>1328 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1316 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1303 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1278 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1265 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1252 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1240 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1227 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1202 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1189 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1177 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1164 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1151 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1139 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1126 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1113 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1101 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1088 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1075 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1063 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1037 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1025 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1012 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>999 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>987 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>974 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>961 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>949 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>936 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>923 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>911 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>898 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>885 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>873 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>860 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>847 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>835 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>822 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>810 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>797 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>784 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>772 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>759 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>746 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>734 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>721 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>708 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>696 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>683 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>670 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>658 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>645 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>632 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>620 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>607 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>594 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>582 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>569 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>556 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>544 MHz</supported_graphics_clock>
|
||||
</supported_mem_clock>
|
||||
</supported_clocks>
|
||||
<processes>
|
||||
</processes>
|
||||
<accounted_processes>
|
||||
</accounted_processes>
|
||||
</gpu>
|
||||
|
||||
</nvidia_smi_log>
|
Loading…
Reference in New Issue