YARN-6620. Add support in NodeManager to isolate GPU devices by using CGroups. Contributed by Wangda Tan.

Sunil G 2017-10-11 23:44:33 +05:30
parent 3de574413c
commit fa5cfc68f3
42 changed files with 3364 additions and 204 deletions


@ -18,11 +18,14 @@
package org.apache.hadoop.yarn.api.records;
import com.google.common.collect.ImmutableMap;
import org.apache.curator.shaded.com.google.common.reflect.ClassPath;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
import org.apache.hadoop.yarn.util.UnitsConversionUtil;
import java.util.Map;
/**
* Class to encapsulate information about a Resource - the name of the resource,
* the units(milli, micro, etc), the type(countable), and the value.
@ -36,13 +39,20 @@ public class ResourceInformation implements Comparable<ResourceInformation> {
private long minimumAllocation;
private long maximumAllocation;
// Known resource types
public static final String MEMORY_URI = "memory-mb";
public static final String VCORES_URI = "vcores";
public static final String GPU_URI = "yarn.io/gpu";
public static final ResourceInformation MEMORY_MB =
ResourceInformation.newInstance(MEMORY_URI, "Mi");
public static final ResourceInformation VCORES =
ResourceInformation.newInstance(VCORES_URI);
public static final ResourceInformation GPUS =
ResourceInformation.newInstance(GPU_URI);
public static final Map<String, ResourceInformation> MANDATORY_RESOURCES =
ImmutableMap.of(MEMORY_URI, MEMORY_MB, VCORES_URI, VCORES, GPU_URI, GPUS);
/**
* Get the name for the resource.
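For context, a brief sketch (not part of the patch) of how the new GPU_URI constant can be used when building a container request, assuming yarn.io/gpu has been registered as a resource type via resource-types.xml; Resource#setResourceValue is the same call used by GpuNodeResourceUpdateHandler later in this commit:

// Request 4096 MB, 4 vcores and 2 GPUs for a container.
Resource capability = Resource.newInstance(4096, 4);
capability.setResourceValue(ResourceInformation.GPU_URI, 2);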


@ -1433,6 +1433,39 @@ public class YarnConfiguration extends Configuration {
public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT =
NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit";
/**
* Prefix for computation resources; examples of computation resources are
* GPU, FPGA, TPU, etc.
*/
@Private
public static final String NM_RESOURCE_PLUGINS =
NM_PREFIX + "resource-plugins";
/**
* Prefix for gpu configurations. Work in progress: This configuration
* parameter may be changed/removed in the future.
*/
@Private
public static final String NM_GPU_RESOURCE_PREFIX =
NM_RESOURCE_PLUGINS + ".gpu.";
@Private
public static final String NM_GPU_ALLOWED_DEVICES =
NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices";
@Private
public static final String AUTOMATICALLY_DISCOVER_GPU_DEVICES = "auto";
/**
* This setting controls where to find and how to invoke GPU discovery binaries.
*/
@Private
public static final String NM_GPU_PATH_TO_EXEC =
NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
@Private
public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = "";
/** NM Webapp address.**/
public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address";
public static final int DEFAULT_NM_WEBAPP_PORT = 8042;
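A minimal sketch (not part of the patch) of how these new keys are read; the same defaults are applied by GpuDiscoverer later in this commit:

Configuration conf = new YarnConfiguration();
// "auto" (the default) means the NM discovers GPUs itself via nvidia-smi.
String allowedDevices = conf.get(
    YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
    YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
// Empty (the default) means the NM searches common locations for the binary.
String discoveryExec = conf.get(
    YarnConfiguration.NM_GPU_PATH_TO_EXEC,
    YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);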


@ -46,6 +46,8 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
/**
* Helper class to read the resource-types to be supported by the system.
*/
@ -82,33 +84,32 @@ public class ResourceUtils {
*/
String key = "memory";
if (resourceInformationMap.containsKey(key)) {
LOG.warn("Attempt to define resource '" + key +
"', but it is not allowed.");
throw new YarnRuntimeException("Attempt to re-define mandatory resource '"
+ key + "'.");
LOG.warn(
"Attempt to define resource '" + key + "', but it is not allowed.");
throw new YarnRuntimeException(
"Attempt to re-define mandatory resource '" + key + "'.");
}
if (resourceInformationMap.containsKey(MEMORY)) {
ResourceInformation memInfo = resourceInformationMap.get(MEMORY);
String memUnits = ResourceInformation.MEMORY_MB.getUnits();
ResourceTypes memType = ResourceInformation.MEMORY_MB.getResourceType();
if (!memInfo.getUnits().equals(memUnits) || !memInfo.getResourceType()
.equals(memType)) {
throw new YarnRuntimeException(
"Attempt to re-define mandatory resource 'memory-mb'. It can only"
+ " be of type 'COUNTABLE' and have units 'Mi'.");
}
}
for (Map.Entry<String, ResourceInformation> mandatoryResourceEntry :
ResourceInformation.MANDATORY_RESOURCES.entrySet()) {
key = mandatoryResourceEntry.getKey();
ResourceInformation mandatoryRI = mandatoryResourceEntry.getValue();
if (resourceInformationMap.containsKey(VCORES)) {
ResourceInformation vcoreInfo = resourceInformationMap.get(VCORES);
String vcoreUnits = ResourceInformation.VCORES.getUnits();
ResourceTypes vcoreType = ResourceInformation.VCORES.getResourceType();
if (!vcoreInfo.getUnits().equals(vcoreUnits) || !vcoreInfo
.getResourceType().equals(vcoreType)) {
throw new YarnRuntimeException(
"Attempt to re-define mandatory resource 'vcores'. It can only be"
+ " of type 'COUNTABLE' and have units ''(no units).");
ResourceInformation newDefinedRI = resourceInformationMap.get(key);
if (newDefinedRI != null) {
String expectedUnit = mandatoryRI.getUnits();
ResourceTypes expectedType = mandatoryRI.getResourceType();
String actualUnit = newDefinedRI.getUnits();
ResourceTypes actualType = newDefinedRI.getResourceType();
if (!expectedUnit.equals(actualUnit) || !expectedType.equals(
actualType)) {
throw new YarnRuntimeException("Defined mandatory resource type="
+ key + " inside resource-types.xml, however its type or "
+ "unit is conflict to mandatory resource types, expected type="
+ expectedType + ", unit=" + expectedUnit + "; actual type="
+ actualType + " actual unit=" + actualUnit);
}
}
}
}


@ -3347,7 +3347,6 @@
<value>false</value>
</property>
<!-- resource types configuration -->
<property>
<name>yarn.resource-types</name>
@ -3431,4 +3430,45 @@
<name>yarn.scheduler.configuration.zk-store.parent-path</name>
<value>/confstore</value>
</property>
<property>
<description>
When yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices is set to
auto, the YARN NodeManager needs to run a GPU discovery binary (currently
only nvidia-smi is supported) to get GPU-related information.
When this value is empty (the default), the YARN NodeManager will try to
locate the discovery executable itself.
An example of the config value is: /usr/local/bin/nvidia-smi
</description>
<name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name>
<value></value>
</property>
<property>
<description>
Enable additional discovery/isolation of resources on the NodeManager,
split by comma. By default, this is empty. Acceptable values: { "yarn.io/gpu" }.
</description>
<name>yarn.nodemanager.resource-plugins</name>
<value></value>
</property>
<property>
<description>
Specify GPU devices which can be managed by the YARN NodeManager, split by
comma. The number of GPU devices will be reported to the RM to make
scheduling decisions. Set to auto (default) to let YARN automatically
discover GPU resources from the system.
Manually specify GPU devices if auto detection fails or if the admin only
wants a subset of the GPU devices to be managed by YARN. GPU devices are
identified by their minor device number. A common approach to get the
minor device numbers of GPUs is to use "nvidia-smi -q" and search for
"Minor Number" in the output. An example of manual specification is
"0,1,2,4", which allows the YARN NodeManager to manage GPU devices with minor numbers 0/1/2/4.
</description>
<name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name>
<value>auto</value>
</property>
</configuration>
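The programmatic equivalent of the three properties above, using the constants added to YarnConfiguration in this patch (the device list and binary path are the illustrative values from the descriptions, not defaults):

Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_RESOURCE_PLUGINS, ResourceInformation.GPU_URI); // "yarn.io/gpu"
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/usr/local/bin/nvidia-smi");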


@ -50,6 +50,23 @@ public class TestResourceUtils {
}
}
public static void addNewTypesToResources(String... resourceTypes) {
// Initialize resource map
Map<String, ResourceInformation> riMap = new HashMap<>();
// Initialize mandatory resources
riMap.put(ResourceInformation.MEMORY_URI, ResourceInformation.MEMORY_MB);
riMap.put(ResourceInformation.VCORES_URI, ResourceInformation.VCORES);
for (String newResource : resourceTypes) {
riMap.put(newResource, ResourceInformation
.newInstance(newResource, "", 0, ResourceTypes.COUNTABLE, 0,
Integer.MAX_VALUE));
}
ResourceUtils.initializeResourcesFromResourceInformationMap(riMap);
}
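A GPU-related test can then register the GPU resource type up front, for example (hypothetical usage, not shown in this hunk):

// Registers memory, vcores and yarn.io/gpu before exercising GPU code paths.
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);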
@Before
public void setup() {
ResourceUtils.resetResourceTypes();


@ -113,9 +113,10 @@ public abstract class ContainerExecutor implements Configurable {
* Run the executor initialization steps.
* Verify that the necessary configs and permissions are in place.
*
* @param nmContext Context of NM
* @throws IOException if initialization fails
*/
public abstract void init() throws IOException;
public abstract void init(Context nmContext) throws IOException;
/**
* This function localizes the JAR file on-demand.
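A sketch of what the signature change means for a custom executor (MyExecutor is illustrative only, not part of the patch):

public class MyExecutor extends DefaultContainerExecutor {
  @Override
  public void init(Context nmContext) throws IOException {
    super.init(nmContext);
    // The NM context is now available here, e.g. nmContext.getResourcePluginManager().
  }
}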


@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
@ -122,4 +123,6 @@ public interface Context {
ContainerExecutor getContainerExecutor();
ContainerStateTransitionListener getContainerStateTransitionListener();
ResourcePluginManager getResourcePluginManager();
}


@ -135,7 +135,7 @@ public class DefaultContainerExecutor extends ContainerExecutor {
}
@Override
public void init() throws IOException {
public void init(Context nmContext) throws IOException {
// nothing to do or verify here
}


@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@ -281,7 +282,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
}
@Override
public void init() throws IOException {
public void init(Context nmContext) throws IOException {
Configuration conf = super.getConf();
// Send command to executor which will just start up,
@ -305,7 +306,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
try {
resourceHandlerChain = ResourceHandlerModule
.getConfiguredResourceHandlerChain(conf);
.getConfiguredResourceHandlerChain(conf, nmContext);
if (LOG.isDebugEnabled()) {
LOG.debug("Resource handler chain enabled = " + (resourceHandlerChain
!= null));
@ -844,4 +845,9 @@ public class LinuxContainerExecutor extends ContainerExecutor {
e);
}
}
@VisibleForTesting
public ResourceHandler getResourceHandler() {
return resourceHandlerChain;
}
}


@ -18,23 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@ -65,12 +49,16 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
@ -78,14 +66,25 @@ import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ScriptBasedNodeLabel
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer;
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicBoolean;
public class NodeManager extends CompositeService
implements EventHandler<NodeManagerEvent> {
@ -331,6 +330,18 @@ public class NodeManager extends CompositeService
nmCheckintervalTime, scriptTimeout, scriptArgs);
}
@VisibleForTesting
protected ResourcePluginManager createResourcePluginManager() {
return new ResourcePluginManager();
}
@VisibleForTesting
protected ContainerExecutor createContainerExecutor(Configuration conf) {
return ReflectionUtils.newInstance(
conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
DefaultContainerExecutor.class, ContainerExecutor.class), conf);
}
@Override
protected void serviceInit(Configuration conf) throws Exception {
rmWorkPreservingRestartEnabled = conf.getBoolean(YarnConfiguration
@ -356,11 +367,20 @@ public class NodeManager extends CompositeService
this.aclsManager = new ApplicationACLsManager(conf);
ContainerExecutor exec = ReflectionUtils.newInstance(
conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
DefaultContainerExecutor.class, ContainerExecutor.class), conf);
boolean isDistSchedulingEnabled =
conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
this.context = createNMContext(containerTokenSecretManager,
nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
ResourcePluginManager pluginManager = createResourcePluginManager();
pluginManager.initialize(context);
((NMContext)context).setResourcePluginManager(pluginManager);
ContainerExecutor exec = createContainerExecutor(conf);
try {
exec.init();
exec.init(context);
} catch (IOException e) {
throw new YarnRuntimeException("Failed to initialize container executor", e);
}
@ -376,13 +396,6 @@ public class NodeManager extends CompositeService
getNodeHealthScriptRunner(conf), dirsHandler);
addService(nodeHealthChecker);
boolean isDistSchedulingEnabled =
conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
this.context = createNMContext(containerTokenSecretManager,
nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
((NMContext)context).setContainerExecutor(exec);
@ -459,6 +472,12 @@ public class NodeManager extends CompositeService
try {
super.serviceStop();
DefaultMetricsSystem.shutdown();
// Cleanup ResourcePluginManager
ResourcePluginManager rpm = context.getResourcePluginManager();
if (rpm != null) {
rpm.cleanup();
}
} finally {
// YARN-3641: NM's services stop get failed shouldn't block the
// release of NMLevelDBStore.
@ -596,6 +615,8 @@ public class NodeManager extends CompositeService
private ContainerStateTransitionListener containerStateTransitionListener;
private ResourcePluginManager resourcePluginManager;
public NMContext(NMContainerTokenSecretManager containerTokenSecretManager,
NMTokenSecretManagerInNM nmTokenSecretManager,
LocalDirsHandlerService dirsHandler, ApplicationACLsManager aclsManager,
@ -796,6 +817,15 @@ public class NodeManager extends CompositeService
ContainerStateTransitionListener transitionListener) {
this.containerStateTransitionListener = transitionListener;
}
public ResourcePluginManager getResourcePluginManager() {
return resourcePluginManager;
}
public void setResourcePluginManager(
ResourcePluginManager resourcePluginManager) {
this.resourcePluginManager = resourcePluginManager;
}
}
/**


@ -33,6 +33,9 @@ import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -178,14 +181,15 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
long memoryMb = totalResource.getMemorySize();
float vMemToPMem =
conf.getFloat(
YarnConfiguration.NM_VMEM_PMEM_RATIO,
YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
YarnConfiguration.NM_VMEM_PMEM_RATIO,
YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
long virtualMemoryMb = (long)Math.ceil(memoryMb * vMemToPMem);
int virtualCores = totalResource.getVirtualCores();
LOG.info("Nodemanager resources: memory set to " + memoryMb + "MB.");
LOG.info("Nodemanager resources: vcores set to " + virtualCores + ".");
LOG.info("Nodemanager resources: " + totalResource);
// Update configured resources via plugins.
updateConfiguredResourcesViaPlugins(totalResource);
LOG.info("Nodemanager resources is set to: " + totalResource);
metrics.addResource(totalResource);
@ -342,12 +346,27 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
return ServerRMProxy.createRMProxy(conf, ResourceTracker.class);
}
private void updateConfiguredResourcesViaPlugins(
Resource configuredResource) throws YarnException {
ResourcePluginManager pluginManager = context.getResourcePluginManager();
if (pluginManager != null && pluginManager.getNameToPlugins() != null) {
// Update configured resource
for (ResourcePlugin resourcePlugin : pluginManager.getNameToPlugins()
.values()) {
if (resourcePlugin.getNodeResourceHandlerInstance() != null) {
resourcePlugin.getNodeResourceHandlerInstance()
.updateConfiguredResource(configuredResource);
}
}
}
}
@VisibleForTesting
protected void registerWithRM()
throws YarnException, IOException {
RegisterNodeManagerResponse regNMResponse;
Set<NodeLabel> nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration();
// Synchronize NM-RM registration with
// ContainerManagerImpl#increaseContainersResource and
// ContainerManagerImpl#startContainers to avoid race condition
@ -358,6 +377,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource,
nodeManagerVersionId, containerReports, getRunningApplications(),
nodeLabels, physicalResource);
if (containerReports != null) {
LOG.info("Registering with RM using containers :" + containerReports);
}
@ -406,7 +426,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
if (masterKey != null) {
this.context.getContainerTokenSecretManager().setMasterKey(masterKey);
}
masterKey = regNMResponse.getNMTokenMasterKey();
if (masterKey != null) {
this.context.getNMTokenSecretManager().setMasterKey(masterKey);
@ -732,7 +752,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
}
}
}
@Override
public long getRMIdentifier() {
return this.rmIdentifier;


@ -51,6 +51,7 @@ public class PrivilegedOperation {
TC_READ_STATS("--tc-read-stats"),
ADD_PID_TO_CGROUP(""), //no CLI switch supported yet.
RUN_DOCKER_CMD("--run-docker"),
GPU("--module-gpu"),
LIST_AS_USER(""); //no CLI switch supported yet.
private final String option;


@ -20,6 +20,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@ -135,7 +136,8 @@ public class ResourceHandlerChain implements ResourceHandler {
return allOperations;
}
List<ResourceHandler> getResourceHandlerList() {
@VisibleForTesting
public List<ResourceHandler> getResourceHandlerList() {
return Collections.unmodifiableList(resourceHandlers);
}


@ -21,25 +21,28 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler;
import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Provides mechanisms to get various resource handlers - cpu, memory, network,
@ -206,22 +209,41 @@ public class ResourceHandlerModule {
}
private static void initializeConfiguredResourceHandlerChain(
Configuration conf) throws ResourceHandlerException {
Configuration conf, Context nmContext)
throws ResourceHandlerException {
ArrayList<ResourceHandler> handlerList = new ArrayList<>();
addHandlerIfNotNull(handlerList, getOutboundBandwidthResourceHandler(conf));
addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf));
addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf));
addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf));
addHandlersFromConfiguredResourcePlugins(handlerList, conf, nmContext);
resourceHandlerChain = new ResourceHandlerChain(handlerList);
}
private static void addHandlersFromConfiguredResourcePlugins(
List<ResourceHandler> handlerList, Configuration conf,
Context nmContext) throws ResourceHandlerException {
ResourcePluginManager pluginManager = nmContext.getResourcePluginManager();
if (pluginManager != null) {
Map<String, ResourcePlugin> pluginMap = pluginManager.getNameToPlugins();
if (pluginMap != null) {
for (ResourcePlugin plugin : pluginMap.values()) {
addHandlerIfNotNull(handlerList, plugin
.createResourceHandler(nmContext,
getInitializedCGroupsHandler(conf),
PrivilegedOperationExecutor.getInstance(conf)));
}
}
}
}
public static ResourceHandlerChain getConfiguredResourceHandlerChain(
Configuration conf) throws ResourceHandlerException {
Configuration conf, Context nmContext) throws ResourceHandlerException {
if (resourceHandlerChain == null) {
synchronized (ResourceHandlerModule.class) {
if (resourceHandlerChain == null) {
initializeConfiguredResourceHandlerChain(conf);
initializeConfiguredResourceHandlerChain(conf, nmContext);
}
}
}
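With the extra Context argument, handlers contributed by resource plugins (such as the GPU handler below) become part of the returned chain. A short sketch of the updated call site, mirroring LinuxContainerExecutor#init (a LOG is assumed to be in scope; exception handling omitted):

ResourceHandlerChain chain =
    ResourceHandlerModule.getConfiguredResourceHandlerChain(conf, nmContext);
if (chain != null) {
  // getResourceHandlerList() is now public for tests/inspection.
  for (ResourceHandler handler : chain.getResourceHandlerList()) {
    LOG.debug("Configured resource handler: " + handler.getClass().getName());
  }
}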


@ -0,0 +1,242 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
/**
* Allocate GPU resources according to requirements
*/
public class GpuResourceAllocator {
final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
private Set<Integer> allowedGpuDevices = new TreeSet<>();
private Map<Integer, ContainerId> usedDevices = new TreeMap<>();
private Context nmContext;
public GpuResourceAllocator(Context ctx) {
this.nmContext = ctx;
}
/**
* Contains allowed and denied devices, identified by minor number.
* Denied devices are used by the cgroups devices module for blacklisting.
*/
static class GpuAllocation {
private Set<Integer> allowed = Collections.emptySet();
private Set<Integer> denied = Collections.emptySet();
GpuAllocation(Set<Integer> allowed, Set<Integer> denied) {
if (allowed != null) {
this.allowed = ImmutableSet.copyOf(allowed);
}
if (denied != null) {
this.denied = ImmutableSet.copyOf(denied);
}
}
public Set<Integer> getAllowedGPUs() {
return allowed;
}
public Set<Integer> getDeniedGPUs() {
return denied;
}
}
/**
* Add GPU to allowed list
* @param minorNumber minor number of the GPU device.
*/
public synchronized void addGpu(int minorNumber) {
allowedGpuDevices.add(minorNumber);
}
private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices,
ContainerId containerId) {
return "Failed to find enough GPUs, requestor=" + containerId
+ ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus="
+ getAvailableGpus();
}
@VisibleForTesting
public synchronized int getAvailableGpus() {
return allowedGpuDevices.size() - usedDevices.size();
}
public synchronized void recoverAssignedGpus(ContainerId containerId)
throws ResourceHandlerException {
Container c = nmContext.getContainers().get(containerId);
if (null == c) {
throw new ResourceHandlerException(
"This shouldn't happen, cannot find container with id="
+ containerId);
}
for (Serializable deviceId : c.getResourceMappings().getAssignedResources(
GPU_URI)){
if (!(deviceId instanceof String)) {
throw new ResourceHandlerException(
"Trying to recover device id, however it"
+ " is not String, this shouldn't happen");
}
int devId;
try {
devId = Integer.parseInt((String)deviceId);
} catch (NumberFormatException e) {
throw new ResourceHandlerException("Failed to recover device id because"
+ "it is not a valid integer, devId:" + deviceId);
}
// Make sure it is in allowed GPU device.
if (!allowedGpuDevices.contains(devId)) {
throw new ResourceHandlerException("Try to recover device id = " + devId
+ " however it is not in allowed device list:" + StringUtils
.join(",", allowedGpuDevices));
}
// Make sure it is not occupied by anybody else
if (usedDevices.containsKey(devId)) {
throw new ResourceHandlerException("Try to recover device id = " + devId
+ " however it is already assigned to container=" + usedDevices
.get(devId) + ", please double check what happened.");
}
usedDevices.put(devId, containerId);
}
}
private int getRequestedGpus(Resource requestedResource) {
try {
return Long.valueOf(requestedResource.getResourceValue(
GPU_URI)).intValue();
} catch (ResourceNotFoundException e) {
return 0;
}
}
/**
* Assign GPUs to the requestor.
* @param container container to allocate GPUs for
* @return GpuAllocation with assigned and denied GPU minor numbers
* @throws ResourceHandlerException when it fails to assign GPUs
*/
public synchronized GpuAllocation assignGpus(Container container)
throws ResourceHandlerException {
Resource requestedResource = container.getResource();
ContainerId containerId = container.getContainerId();
int numRequestedGpuDevices = getRequestedGpus(requestedResource);
// Assign Gpus to container if requested some.
if (numRequestedGpuDevices > 0) {
if (numRequestedGpuDevices > getAvailableGpus()) {
throw new ResourceHandlerException(
getResourceHandlerExceptionMessage(numRequestedGpuDevices,
containerId));
}
Set<Integer> assignedGpus = new HashSet<>();
for (int deviceNum : allowedGpuDevices) {
if (!usedDevices.containsKey(deviceNum)) {
usedDevices.put(deviceNum, containerId);
assignedGpus.add(deviceNum);
if (assignedGpus.size() == numRequestedGpuDevices) {
break;
}
}
}
// Record in state store if we allocated anything
if (!assignedGpus.isEmpty()) {
List<Serializable> allocatedDevices = new ArrayList<>();
for (int gpu : assignedGpus) {
allocatedDevices.add(String.valueOf(gpu));
}
try {
// Update Container#getResourceMapping.
ResourceMappings.AssignedResources assignedResources =
new ResourceMappings.AssignedResources();
assignedResources.updateAssignedResources(allocatedDevices);
container.getResourceMappings().addAssignedResources(GPU_URI,
assignedResources);
// Update state store.
nmContext.getNMStateStore().storeAssignedResources(containerId,
GPU_URI, allocatedDevices);
} catch (IOException e) {
cleanupAssignGpus(containerId);
throw new ResourceHandlerException(e);
}
}
return new GpuAllocation(assignedGpus,
Sets.difference(allowedGpuDevices, assignedGpus));
}
return new GpuAllocation(null, allowedGpuDevices);
}
/**
* Clean up all Gpus assigned to containerId
* @param containerId containerId
*/
public synchronized void cleanupAssignGpus(ContainerId containerId) {
Iterator<Map.Entry<Integer, ContainerId>> iter =
usedDevices.entrySet().iterator();
while (iter.hasNext()) {
if (iter.next().getValue().equals(containerId)) {
iter.remove();
}
}
}
@VisibleForTesting
public synchronized Map<Integer, ContainerId> getDeviceAllocationMapping() {
return new HashMap<>(usedDevices);
}
}
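A condensed sketch of the allocator flow (assumes nmContext and a container requesting GPUs are in scope, and that the caller handles ResourceHandlerException):

GpuResourceAllocator allocator = new GpuResourceAllocator(nmContext);
allocator.addGpu(0);
allocator.addGpu(1);
GpuResourceAllocator.GpuAllocation allocation = allocator.assignGpus(container);
// Denied minor numbers are what the cgroups devices module blacklists.
Set<Integer> denied = allocation.getDeniedGPUs();
// ... after the container completes:
allocator.cleanupAssignGpus(container.getContainerId());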


@ -0,0 +1,153 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class GpuResourceHandlerImpl implements ResourceHandler {
final static Log LOG = LogFactory
.getLog(GpuResourceHandlerImpl.class);
// These will be used by container-executor to add the necessary CLI options
public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus";
public static final String CONTAINER_ID_CLI_OPTION = "--container_id";
private GpuResourceAllocator gpuAllocator;
private CGroupsHandler cGroupsHandler;
private PrivilegedOperationExecutor privilegedOperationExecutor;
public GpuResourceHandlerImpl(Context nmContext,
CGroupsHandler cGroupsHandler,
PrivilegedOperationExecutor privilegedOperationExecutor) {
this.cGroupsHandler = cGroupsHandler;
this.privilegedOperationExecutor = privilegedOperationExecutor;
gpuAllocator = new GpuResourceAllocator(nmContext);
}
@Override
public List<PrivilegedOperation> bootstrap(Configuration configuration)
throws ResourceHandlerException {
List<Integer> minorNumbersOfUsableGpus;
try {
minorNumbersOfUsableGpus = GpuDiscoverer.getInstance()
.getMinorNumbersOfGpusUsableByYarn();
} catch (YarnException e) {
LOG.error("Exception when trying to get usable GPU device", e);
throw new ResourceHandlerException(e);
}
for (int minorNumber : minorNumbersOfUsableGpus) {
gpuAllocator.addGpu(minorNumber);
}
// And initialize cgroups
this.cGroupsHandler.initializeCGroupController(
CGroupsHandler.CGroupController.DEVICES);
return null;
}
@Override
public synchronized List<PrivilegedOperation> preStart(Container container)
throws ResourceHandlerException {
String containerIdStr = container.getContainerId().toString();
// Assign Gpus to container if requested some.
GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus(
container);
// Create device cgroups for the container
cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES,
containerIdStr);
try {
// Execute container-executor to set up GPU isolation before launching the container
PrivilegedOperation privilegedOperation = new PrivilegedOperation(
PrivilegedOperation.OperationType.GPU, Arrays
.asList(CONTAINER_ID_CLI_OPTION, containerIdStr));
if (!allocation.getDeniedGPUs().isEmpty()) {
privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
StringUtils.join(",", allocation.getDeniedGPUs())));
}
privilegedOperationExecutor.executePrivilegedOperation(
privilegedOperation, true);
} catch (PrivilegedOperationException e) {
cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
containerIdStr);
LOG.warn("Could not update cgroup for container", e);
throw new ResourceHandlerException(e);
}
List<PrivilegedOperation> ret = new ArrayList<>();
ret.add(new PrivilegedOperation(
PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP,
PrivilegedOperation.CGROUP_ARG_PREFIX
+ cGroupsHandler.getPathForCGroupTasks(
CGroupsHandler.CGroupController.DEVICES, containerIdStr)));
return ret;
}
@VisibleForTesting
public GpuResourceAllocator getGpuAllocator() {
return gpuAllocator;
}
@Override
public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
throws ResourceHandlerException {
gpuAllocator.recoverAssignedGpus(containerId);
return null;
}
@Override
public synchronized List<PrivilegedOperation> postComplete(
ContainerId containerId) throws ResourceHandlerException {
gpuAllocator.cleanupAssignGpus(containerId);
cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
containerId.toString());
return null;
}
@Override
public List<PrivilegedOperation> teardown() throws ResourceHandlerException {
return null;
}
}
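Lifecycle sketch of how the resource handler chain drives this handler (conf, container and the collaborators are assumed to be in scope; exceptions omitted):

GpuResourceHandlerImpl handler = new GpuResourceHandlerImpl(
    nmContext, cGroupsHandler, privilegedOperationExecutor);
handler.bootstrap(conf);                        // discover GPUs, init devices cgroup controller
List<PrivilegedOperation> ops = handler.preStart(container);   // assign GPUs, build cgroup ops
handler.postComplete(container.getContainerId());              // release GPUs, delete the cgroup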


@ -0,0 +1,52 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.YarnException;
/**
* Plugins to handle resources on a node. This will be used by
* {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}
*/
public abstract class NodeResourceUpdaterPlugin {
/**
* Update configured resource for the given component.
* @param res resource passed in by an external module (such as
* {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}).
* @throws YarnException when any issue happens.
*/
public abstract void updateConfiguredResource(Resource res)
throws YarnException;
/**
* This method will be called when the node's resource is loaded from
* dynamic-resources.xml in ResourceManager.
*
* @param newResource new resource reported by the RM
* @throws YarnException when there is any mismatch between NM and RM
*/
public void handleUpdatedResourceFromRM(Resource newResource) throws
YarnException {
// By default do nothing; subclasses should override this method when any
// special handling is required for a new resource reported by the RM.
}
// TODO: add implementation to update node attribute once YARN-3409 merged.
}
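A hypothetical subclass illustrating the contract (the resource name and count are made up; the GPU implementation appears later in this patch as GpuNodeResourceUpdateHandler):

public class StaticFpgaResourceUpdater extends NodeResourceUpdaterPlugin {
  @Override
  public void updateConfiguredResource(Resource res) throws YarnException {
    // Report a fixed number of a custom resource type to the RM
    // (assumes yarn.io/fpga is registered in resource-types.xml).
    res.setResourceValue("yarn.io/fpga", 2);
  }
}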


@ -0,0 +1,83 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
/**
* {@link ResourcePlugin} is an interface that makes it easier for the node
* manager to support discovery, management and isolation of new resource types.
*
* <p>
* It has two major parts: {@link ResourcePlugin#createResourceHandler(Context,
* CGroupsHandler, PrivilegedOperationExecutor)} and
* {@link ResourcePlugin#getNodeResourceHandlerInstance()}, see javadocs below
* for more details.
* </p>
*/
public interface ResourcePlugin {
/**
* Initialize the plugin, this will be invoked during NM startup.
* @param context NM Context
* @throws YarnException when any issue occurs
*/
void initialize(Context context) throws YarnException;
/**
* The plugin needs to return a {@link ResourceHandler} when any special
* isolation is required for the resource type. This will be added to the
* {@link ResourceHandlerChain} during NodeManager startup. When no special
* isolation is needed, return null.
*
* @param nmContext NodeManager context.
* @param cGroupsHandler CGroupsHandler
* @param privilegedOperationExecutor Privileged Operation Executor.
* @return ResourceHandler
*/
ResourceHandler createResourceHandler(Context nmContext,
CGroupsHandler cGroupsHandler,
PrivilegedOperationExecutor privilegedOperationExecutor);
/**
* The plugin needs to return a {@link NodeResourceUpdaterPlugin} when any
* discovery mechanism is required for the resource type. For example, to set
* a resource value during NM registration or to send updates during NM-RM
* heartbeats, we can implement a {@link NodeResourceUpdaterPlugin} and update
* fields of {@link org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest}
* or {@link org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest}.
*
* This will be invoked during every node status update or node registration;
* please avoid creating a new instance every time.
*
* @return NodeResourceUpdaterPlugin, could be null when no discovery is needed.
*/
NodeResourceUpdaterPlugin getNodeResourceHandlerInstance();
/**
* Do cleanup of the plugin; this will be invoked when the
* {@link org.apache.hadoop.yarn.server.nodemanager.NodeManager} stops.
* @throws YarnException if any issue occurs
*/
void cleanup() throws YarnException;
}


@ -0,0 +1,106 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
import com.google.common.collect.ImmutableSet;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
/**
* Manages {@link ResourcePlugin} configured on this NodeManager.
*/
public class ResourcePluginManager {
private static final Logger LOG =
LoggerFactory.getLogger(ResourcePluginManager.class);
private static final Set<String> SUPPORTED_RESOURCE_PLUGINS = ImmutableSet.of(
GPU_URI);
private Map<String, ResourcePlugin> configuredPlugins = Collections.EMPTY_MAP;
public synchronized void initialize(Context context)
throws YarnException {
Configuration conf = context.getConf();
String[] plugins = conf.getStrings(YarnConfiguration.NM_RESOURCE_PLUGINS);
if (plugins != null) {
Map<String, ResourcePlugin> pluginMap = new HashMap<>();
// Initialize each plugin
for (String resourceName : plugins) {
resourceName = resourceName.trim();
if (!SUPPORTED_RESOURCE_PLUGINS.contains(resourceName)) {
String msg =
"Trying to initialize resource plugin with name=" + resourceName
+ ", it is not supported, list of supported plugins:"
+ StringUtils.join(",",
SUPPORTED_RESOURCE_PLUGINS);
LOG.error(msg);
throw new YarnException(msg);
}
if (pluginMap.containsKey(resourceName)) {
// Duplicated items, ignore ...
continue;
}
ResourcePlugin plugin = null;
if (resourceName.equals(GPU_URI)) {
plugin = new GpuResourcePlugin();
}
if (plugin == null) {
throw new YarnException(
"This shouldn't happen, plugin=" + resourceName
+ " should be loaded and initialized");
}
plugin.initialize(context);
pluginMap.put(resourceName, plugin);
}
configuredPlugins = Collections.unmodifiableMap(pluginMap);
}
}
public synchronized void cleanup() throws YarnException {
for (ResourcePlugin plugin : configuredPlugins.values()) {
plugin.cleanup();
}
}
/**
* Get resource name (such as gpu/fpga) to plugin references.
* @return read-only map of resource name to plugins.
*/
public synchronized Map<String, ResourcePlugin> getNameToPlugins() {
return configuredPlugins;
}
}
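Wiring sketch, mirroring what NodeManager#serviceInit and serviceStop do earlier in this patch (context is the NM Context; YarnException handling omitted):

ResourcePluginManager rpm = new ResourcePluginManager();
rpm.initialize(context);   // reads yarn.nodemanager.resource-plugins
Map<String, ResourcePlugin> plugins = rpm.getNameToPlugins();
// ... on NodeManager stop:
rpm.cleanup();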


@ -0,0 +1,254 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class GpuDiscoverer {
public static final Logger LOG = LoggerFactory.getLogger(
GpuDiscoverer.class);
@VisibleForTesting
protected static final String DEFAULT_BINARY_NAME = "nvidia-smi";
// When executable path not set, try to search default dirs
// By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
// launched by nvidia-docker).
private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
"/usr/bin", "/bin", "/usr/local/nvidia/bin");
// command should not run more than 10 sec.
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
private static GpuDiscoverer instance;
static {
instance = new GpuDiscoverer();
}
private Configuration conf = null;
private String pathOfGpuBinary = null;
private Map<String, String> environment = new HashMap<>();
private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
private int numOfErrorExecutionSinceLastSucceed = 0;
GpuDeviceInformation lastDiscoveredGpuInformation = null;
private void validateConfOrThrowException() throws YarnException {
if (conf == null) {
throw new YarnException("Please initialize (call initialize) before use "
+ GpuDiscoverer.class.getSimpleName());
}
}
/**
* Get GPU device information from system.
* This needs to be called after initialize().
*
* Please note that this only works on *NIX platforms, so the external
* caller needs to make sure of this.
*
* @return GpuDeviceInformation
* @throws YarnException when any error happens
*/
public synchronized GpuDeviceInformation getGpuDeviceInformation()
throws YarnException {
validateConfOrThrowException();
if (null == pathOfGpuBinary) {
throw new YarnException(
"Failed to find GPU discovery executable, please double check "
+ YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
}
if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
String msg =
"Failed to execute GPU device information detection script for "
+ MAX_REPEATED_ERROR_ALLOWED
+ " times, skip following executions.";
LOG.error(msg);
throw new YarnException(msg);
}
String output;
try {
output = Shell.execCommand(environment,
new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
GpuDeviceInformation info = parser.parseXml(output);
numOfErrorExecutionSinceLastSucceed = 0;
lastDiscoveredGpuInformation = info;
return info;
} catch (IOException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg =
"Failed to execute " + pathOfGpuBinary + " exception message:" + e
.getMessage() + ", continue ...";
if (LOG.isDebugEnabled()) {
LOG.debug(msg);
}
throw new YarnException(e);
} catch (YarnException e) {
numOfErrorExecutionSinceLastSucceed++;
String msg = "Failed to parse xml output" + e.getMessage();
if (LOG.isDebugEnabled()) {
LOG.warn(msg, e);
}
throw e;
}
}
/**
* Get list of minor device numbers of Gpu devices usable by YARN.
*
* @return List of minor device numbers of Gpu devices.
* @throws YarnException when any issue happens
*/
public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
throws YarnException {
validateConfOrThrowException();
String allowedDevicesStr = conf.get(
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
List<Integer> minorNumbers = new ArrayList<>();
if (allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
// Get gpu device information from system.
if (null == lastDiscoveredGpuInformation) {
String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
+ ", however automatically discovering "
+ "GPU information failed, please check NodeManager log for more"
+ " details, as an alternative, admin can specify "
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+ " manually to enable GPU isolation.";
LOG.error(msg);
throw new YarnException(msg);
}
if (lastDiscoveredGpuInformation.getGpus() != null) {
for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
.getGpus()) {
minorNumbers.add(gpu.getMinorNumber());
}
}
} else{
for (String s : allowedDevicesStr.split(",")) {
if (s.trim().length() > 0) {
minorNumbers.add(Integer.valueOf(s.trim()));
}
}
LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
}
return minorNumbers;
}
public synchronized void initialize(Configuration conf) throws YarnException {
this.conf = conf;
numOfErrorExecutionSinceLastSucceed = 0;
String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
if (pathToExecutable.isEmpty()) {
pathToExecutable = DEFAULT_BINARY_NAME;
}
// Validate file existence
File binaryPath = new File(pathToExecutable);
if (!binaryPath.exists()) {
// When the binary does not exist, search the default directories.
boolean found = false;
for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
binaryPath = new File(dir, DEFAULT_BINARY_NAME);
if (binaryPath.exists()) {
found = true;
pathOfGpuBinary = binaryPath.getAbsolutePath();
break;
}
}
if (!found) {
LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
+ ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC
+ "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME);
}
} else{
// If the path specified by the user is a directory, use the default binary under it.
if (binaryPath.isDirectory()) {
binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
+ " under the directory, updated path-to-executable:" + binaryPath
.getAbsolutePath());
}
// Validated
pathOfGpuBinary = binaryPath.getAbsolutePath();
}
// Try to discover GPU information once and print
try {
LOG.info("Trying to discover GPU information ...");
GpuDeviceInformation info = getGpuDeviceInformation();
LOG.info(info.toString());
} catch (YarnException e) {
String msg =
"Failed to discover GPU information from system, exception message:"
+ e.getMessage() + " continue...";
LOG.warn(msg);
}
}
@VisibleForTesting
protected Map<String, String> getEnvironmentToRunCommand() {
return environment;
}
@VisibleForTesting
protected String getPathOfGpuBinary() {
return pathOfGpuBinary;
}
public static GpuDiscoverer getInstance() {
return instance;
}
}
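Usage sketch (assumes nvidia-smi is resolvable on the node or an explicit path is configured; YarnException handling omitted):

GpuDiscoverer discoverer = GpuDiscoverer.getInstance();
discoverer.initialize(conf);
List<Integer> usableMinorNumbers = discoverer.getMinorNumbersOfGpusUsableByYarn();
GpuDeviceInformation info = discoverer.getGpuDeviceInformation();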


@ -0,0 +1,66 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Map;
import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
private static final Logger LOG =
LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
@Override
public void updateConfiguredResource(Resource res) throws YarnException {
LOG.info("Initializing configured GPU resources for the NodeManager.");
List<Integer> usableGpus =
GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
if (null == usableGpus || usableGpus.isEmpty()) {
LOG.info("Didn't find any usable GPUs on the NodeManager.");
// No gpu can be used by YARN.
return;
}
long nUsableGpus = usableGpus.size();
Map<String, ResourceInformation> configuredResourceTypes =
ResourceUtils.getResourceTypes();
if (!configuredResourceTypes.containsKey(GPU_URI)) {
throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
+ GPU_URI
+ " resource-type is not configured inside"
+ " resource-types.xml, please configure it to enable GPU feature or"
+ " remove " + GPU_URI + " from "
+ YarnConfiguration.NM_RESOURCE_PLUGINS);
}
res.setResourceValue(GPU_URI, nUsableGpus);
}
}
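
For context, a hedged sketch of how this handler feeds the discovered GPU count into a node Resource. It assumes yarn.io/gpu has already been registered as a resource type (for example via resource-types.xml, or TestResourceUtils.addNewTypesToResources as in the tests below) and that GpuDiscoverer has been initialized as in the previous sketch; the class name and values are illustrative only.

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler;

public class GpuNodeResourceUpdateSketch {
  public static void main(String[] args) throws YarnException {
    // Node resource as configured without plugins: memory (MB) and vcores.
    Resource nodeResource = Resource.newInstance(8 * 1024, 8);

    // Overwrites the yarn.io/gpu value with the number of usable GPUs, or
    // throws if the resource type has not been registered.
    new GpuNodeResourceUpdateHandler().updateConfiguredResource(nodeResource);

    // Resource#toString includes all configured resource types, so the GPU
    // count shows up alongside memory and vcores.
    System.out.println("Node resource including GPUs: " + nodeResource);
  }
}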

View File

@ -0,0 +1,61 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
public class GpuResourcePlugin implements ResourcePlugin {
private ResourceHandler gpuResourceHandler = null;
private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
@Override
public synchronized void initialize(Context context) throws YarnException {
resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
GpuDiscoverer.getInstance().initialize(context.getConf());
}
@Override
public synchronized ResourceHandler createResourceHandler(
Context context, CGroupsHandler cGroupsHandler,
PrivilegedOperationExecutor privilegedOperationExecutor) {
if (gpuResourceHandler == null) {
gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
privilegedOperationExecutor);
}
return gpuResourceHandler;
}
@Override
public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
return resourceDiscoverHandler;
}
@Override
public void cleanup() throws YarnException {
// Do nothing.
}
}
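
A minimal sketch of the plugin lifecycle defined above, wired up with mocked NodeManager collaborators in the same style as the tests at the end of this patch; the mock setup and class name are illustrative, not part of the patch.

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;

public class GpuResourcePluginLifecycleSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new YarnConfiguration();
    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1"); // illustrative

    Context context = mock(Context.class);
    when(context.getConf()).thenReturn(conf);

    GpuResourcePlugin plugin = new GpuResourcePlugin();
    plugin.initialize(context);   // runs GPU discovery via GpuDiscoverer

    // createResourceHandler() lazily builds and then reuses a single
    // GpuResourceHandlerImpl instance.
    ResourceHandler handler = plugin.createResourceHandler(context,
        mock(CGroupsHandler.class), mock(PrivilegedOperationExecutor.class));

    plugin.cleanup();
  }
}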

View File

@ -0,0 +1,72 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import javax.xml.bind.annotation.XmlRootElement;
import java.util.List;
/**
* All GPU Device Information in the system.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
@XmlRootElement(name = "nvidia_smi_log")
public class GpuDeviceInformation {
List<PerGpuDeviceInformation> gpus;
String driverVersion = "N/A";
// More fields like topology information could be added when needed.
// ...
@javax.xml.bind.annotation.XmlElement(name = "gpu")
public List<PerGpuDeviceInformation> getGpus() {
return gpus;
}
public void setGpus(List<PerGpuDeviceInformation> gpus) {
this.gpus = gpus;
}
@javax.xml.bind.annotation.XmlElement(name = "driver_version")
public String getDriverVersion() {
return driverVersion;
}
public void setDriverVersion(String driverVersion) {
this.driverVersion = driverVersion;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append(
getDriverVersion()).append("\n");
if (gpus != null) {
for (PerGpuDeviceInformation gpu : gpus) {
sb.append("\t").append(gpu.toString()).append("\n");
}
}
return sb.toString();
}
}

View File

@ -0,0 +1,87 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.sax.SAXSource;
import java.io.StringReader;
/**
 * Parses nvidia-smi XML output and returns GPU device information.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class GpuDeviceInformationParser {
private static final Logger LOG = LoggerFactory.getLogger(
GpuDeviceInformationParser.class);
private Unmarshaller unmarshaller = null;
private XMLReader xmlReader = null;
private void init()
throws SAXException, ParserConfigurationException, JAXBException {
SAXParserFactory spf = SAXParserFactory.newInstance();
// Disable external-dtd since by default nvidia-smi output contains
// <!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd"> in header
spf.setFeature(
"http://apache.org/xml/features/nonvalidating/load-external-dtd",
false);
spf.setFeature("http://xml.org/sax/features/validation", false);
JAXBContext jaxbContext = JAXBContext.newInstance(
GpuDeviceInformation.class);
this.xmlReader = spf.newSAXParser().getXMLReader();
this.unmarshaller = jaxbContext.createUnmarshaller();
}
public synchronized GpuDeviceInformation parseXml(String xmlContent)
throws YarnException {
if (unmarshaller == null) {
try {
init();
} catch (SAXException | ParserConfigurationException | JAXBException e) {
LOG.error("Exception while initialize parser", e);
throw new YarnException(e);
}
}
InputSource inputSource = new InputSource(new StringReader(xmlContent));
SAXSource source = new SAXSource(xmlReader, inputSource);
try {
return (GpuDeviceInformation) unmarshaller.unmarshal(source);
} catch (JAXBException e) {
LOG.error("Exception while parsing xml", e);
throw new YarnException(e);
}
}
}
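
To make the JAXB mapping concrete, the sketch below feeds a tiny hand-written fragment in the nvidia-smi XML shape through the parser above. The device values are invented for illustration; in production the XML comes from the discovery binary invoked by GpuDiscoverer.

import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;

public class GpuXmlParsingSketch {
  public static void main(String[] args) throws YarnException {
    // Element names follow the JAXB bindings of the DAO classes:
    // nvidia_smi_log / driver_version / gpu / product_name / minor_number ...
    String xml =
        "<nvidia_smi_log>"
            + "<driver_version>375.66</driver_version>"
            + "<gpu>"
            + "<product_name>Tesla P100</product_name>"
            + "<minor_number>0</minor_number>"
            + "<temperature><gpu_temp>34 C</gpu_temp></temperature>"
            + "</gpu>"
            + "</nvidia_smi_log>";

    GpuDeviceInformation info = new GpuDeviceInformationParser().parseXml(xml);

    // toString() lists the driver version and each GPU's name/minor number.
    System.out.println(info);
    System.out.println("First GPU minor number: "
        + info.getGpus().get(0).getMinorNumber());
  }
}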

View File

@ -0,0 +1,165 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.adapters.XmlAdapter;
/**
* Capture single GPU device information such as memory size, temperature,
* utilization.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
@XmlRootElement(name = "gpu")
public class PerGpuDeviceInformation {
private String productName = "N/A";
private String uuid = "N/A";
private int minorNumber = -1;
private PerGpuUtilizations gpuUtilizations;
private PerGpuMemoryUsage gpuMemoryUsage;
private PerGpuTemperature temperature;
/**
* Convert formats like "34 C", "75.6 %" to float.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
static class StrToFloatBeforeSpaceAdapter extends
XmlAdapter<String, Float> {
@Override
public String marshal(Float v) throws Exception {
if (v == null) {
return "";
}
return String.valueOf(v);
}
@Override
public Float unmarshal(String v) throws Exception {
if (v == null) {
return -1f;
}
return Float.valueOf(v.split(" ")[0]);
}
}
/**
* Convert formats like "725 MiB" to long.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
static class StrToMemAdapter extends XmlAdapter<String, Long> {
@Override
public String marshal(Long v) throws Exception {
if (v == null) {
return "";
}
return String.valueOf(v) + " MiB";
}
@Override
public Long unmarshal(String v) throws Exception {
if (v == null) {
return -1L;
}
return Long.valueOf(v.split(" ")[0]);
}
}
@XmlElement(name = "temperature")
public PerGpuTemperature getTemperature() {
return temperature;
}
public void setTemperature(PerGpuTemperature temperature) {
this.temperature = temperature;
}
@XmlElement(name = "uuid")
public String getUuid() {
return uuid;
}
public void setUuid(String uuid) {
this.uuid = uuid;
}
@XmlElement(name = "product_name")
public String getProductName() {
return productName;
}
public void setProductName(String productName) {
this.productName = productName;
}
@XmlElement(name = "minor_number")
public int getMinorNumber() {
return minorNumber;
}
public void setMinorNumber(int minorNumber) {
this.minorNumber = minorNumber;
}
@XmlElement(name = "utilization")
public PerGpuUtilizations getGpuUtilizations() {
return gpuUtilizations;
}
public void setGpuUtilizations(PerGpuUtilizations utilizations) {
this.gpuUtilizations = utilizations;
}
@XmlElement(name = "bar1_memory_usage")
public PerGpuMemoryUsage getGpuMemoryUsage() {
return gpuMemoryUsage;
}
public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
this.gpuMemoryUsage = gpuMemoryUsage;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("ProductName=").append(productName).append(", MinorNumber=")
.append(minorNumber);
if (getGpuMemoryUsage() != null) {
sb.append(", TotalMemory=").append(
getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
}
if (getGpuUtilizations() != null) {
sb.append(", Utilization=").append(
getGpuUtilizations().getOverallGpuUtilization()).append("%");
}
return sb.toString();
}
}
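
The two XmlAdapter implementations above are what reduce nvidia-smi's human-readable strings to plain numbers. A quick illustrative check of that behavior is sketched below; because the adapters are package-private nested classes, the sketch assumes it lives in the same webapp.dao.gpu package.

package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;

public class GpuValueAdapterSketch {
  public static void main(String[] args) throws Exception {
    // "34 C" / "75.6 %" style values keep only the number before the space.
    Float temp = new PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter()
        .unmarshal("34 C");          // 34.0

    // "725 MiB" style values are reduced to the MiB count.
    Long usedMiB = new PerGpuDeviceInformation.StrToMemAdapter()
        .unmarshal("725 MiB");       // 725

    System.out.println(temp + " / " + usedMiB);
  }
}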

View File

@ -0,0 +1,58 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
@InterfaceAudience.Private
@InterfaceStability.Unstable
@XmlRootElement(name = "bar1_memory_usage")
public class PerGpuMemoryUsage {
long usedMemoryMiB = -1L;
long availMemoryMiB = -1L;
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
@XmlElement(name = "used")
public Long getUsedMemoryMiB() {
return usedMemoryMiB;
}
public void setUsedMemoryMiB(Long usedMemoryMiB) {
this.usedMemoryMiB = usedMemoryMiB;
}
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
@XmlElement(name = "free")
public Long getAvailMemoryMiB() {
return availMemoryMiB;
}
public void setAvailMemoryMiB(Long availMemoryMiB) {
this.availMemoryMiB = availMemoryMiB;
}
public long getTotalMemoryMiB() {
return usedMemoryMiB + availMemoryMiB;
}
}

View File

@ -0,0 +1,80 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
/**
* Temperature of GPU
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
@XmlRootElement(name = "temperature")
public class PerGpuTemperature {
private float currentGpuTemp = Float.MIN_VALUE;
private float maxGpuTemp = Float.MIN_VALUE;
private float slowThresholdGpuTemp = Float.MIN_VALUE;
/**
   * Get the current GPU temperature in Celsius
* @return temperature
*/
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
@XmlElement(name = "gpu_temp")
public Float getCurrentGpuTemp() {
return currentGpuTemp;
}
public void setCurrentGpuTemp(Float currentGpuTemp) {
this.currentGpuTemp = currentGpuTemp;
}
/**
   * Get the maximum possible GPU temperature in Celsius
* @return temperature
*/
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
@XmlElement(name = "gpu_temp_max_threshold")
public Float getMaxGpuTemp() {
return maxGpuTemp;
}
public void setMaxGpuTemp(Float maxGpuTemp) {
this.maxGpuTemp = maxGpuTemp;
}
/**
   * Get the GPU temperature in Celsius above which the GPU may run slower
* @return temperature
*/
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
@XmlElement(name = "gpu_temp_slow_threshold")
public Float getSlowThresholdGpuTemp() {
return slowThresholdGpuTemp;
}
public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) {
this.slowThresholdGpuTemp = slowThresholdGpuTemp;
}
}

View File

@ -0,0 +1,50 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
/**
* GPU utilizations
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
@XmlRootElement(name = "utilization")
public class PerGpuUtilizations {
private float overallGpuUtilization;
/**
* Overall percent GPU utilization
* @return utilization
*/
@XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
@XmlElement(name = "gpu_util")
public Float getOverallGpuUtilization() {
return overallGpuUtilization;
}
public void setOverallGpuUtilization(Float overallGpuUtilization) {
this.overallGpuUtilization = overallGpuUtilization;
}
}

View File

@ -0,0 +1,164 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.net.ServerSocketUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl;
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.junit.Assert;
import org.junit.Before;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
public class NodeManagerTestBase {
// temp fix until metrics system can auto-detect itself running in unit test:
static {
DefaultMetricsSystem.setMiniClusterMode(true);
}
protected static final Logger LOG =
LoggerFactory.getLogger(TestNodeStatusUpdater.class);
protected static final File basedir =
new File("target", TestNodeStatusUpdater.class.getName());
protected static final File nmLocalDir = new File(basedir, "nm0");
protected static final File tmpDir = new File(basedir, "tmpDir");
protected static final File remoteLogsDir = new File(basedir, "remotelogs");
protected static final File logsDir = new File(basedir, "logs");
protected static final RecordFactory recordFactory = RecordFactoryProvider
.getRecordFactory(null);
protected Configuration conf;
protected YarnConfiguration createNMConfig() throws IOException {
return createNMConfig(ServerSocketUtil.getPort(49170, 10));
}
protected YarnConfiguration createNMConfig(int port) throws IOException {
YarnConfiguration conf = new YarnConfiguration();
String localhostAddress = null;
try {
localhostAddress = InetAddress.getByName("localhost")
.getCanonicalHostName();
} catch (UnknownHostException e) {
Assert.fail("Unable to get localhost address: " + e.getMessage());
}
conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
+ ServerSocketUtil.getPort(49160, 10));
conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
remoteLogsDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
return conf;
}
public static class BaseResourceTrackerForTest implements ResourceTracker {
@Override
public RegisterNodeManagerResponse registerNodeManager(
RegisterNodeManagerRequest request) throws YarnException, IOException {
return new RegisterNodeManagerResponsePBImpl();
}
@Override
public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
throws YarnException, IOException {
return new NodeHeartbeatResponsePBImpl();
}
@Override
public UnRegisterNodeManagerResponse unRegisterNodeManager(
UnRegisterNodeManagerRequest request)
throws YarnException, IOException {
return new UnRegisterNodeManagerResponsePBImpl();
}
}
protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl {
public ResourceTracker resourceTracker;
protected Context context;
public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
ResourceTracker resourceTracker) {
super(context, dispatcher, healthChecker, metrics);
this.context = context;
this.resourceTracker = resourceTracker;
}
@Override
protected ResourceTracker getRMClient() {
return resourceTracker;
}
@Override
protected void stopRMProxy() {
return;
}
}
public class MyContainerManager extends ContainerManagerImpl {
public boolean signaled = false;
public MyContainerManager(Context context, ContainerExecutor exec,
DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
NodeManagerMetrics metrics,
LocalDirsHandlerService dirsHandler) {
super(context, exec, deletionContext, nodeStatusUpdater,
metrics, dirsHandler);
}
@Override
public void handle(ContainerManagerEvent event) {
if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
signaled = true;
}
}
}
@Before
public void setUp() throws IOException {
nmLocalDir.mkdirs();
tmpDir.mkdirs();
logsDir.mkdirs();
remoteLogsDir.mkdirs();
conf = createNMConfig();
}
}

View File

@ -178,7 +178,7 @@ public class TestDefaultContainerExecutor {
FileContext lfs = FileContext.getLocalFSFileContext(conf);
DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs);
executor.setConf(conf);
executor.init();
executor.init(null);
try {
executor.createUserLocalDirs(localDirs, user);
@ -317,7 +317,7 @@ public class TestDefaultContainerExecutor {
Path workDir = localDir;
Path pidFile = new Path(workDir, "pid.txt");
mockExec.init();
mockExec.init(null);
mockExec.activateContainer(cId, pidFile);
int ret = mockExec.launchContainer(new ContainerStartContext.Builder()
.setContainer(container)

View File

@ -628,7 +628,7 @@ public class TestLinuxContainerExecutor {
LinuxContainerExecutor lce = new LinuxContainerExecutor();
lce.setConf(conf);
try {
lce.init();
lce.init(null);
} catch (IOException e) {
// expected if LCE isn't setup right, but not necessary for this test
}

View File

@ -415,7 +415,7 @@ public class TestLinuxContainerExecutorWithMocks {
@Test
public void testInit() throws Exception {
mockExec.init();
mockExec.init(mock(Context.class));
assertEquals(Arrays.asList("--checksetup"), readMockParams());
}

View File

@ -37,7 +37,7 @@ public class TestNodeManager {
public static final class InvalidContainerExecutor extends
DefaultContainerExecutor {
@Override
public void init() throws IOException {
public void init(Context nmContext) throws IOException {
throw new IOException("dummy executor init called");
}
}

View File

@ -20,16 +20,14 @@ package org.apache.hadoop.yarn.server.nodemanager;
import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
@ -81,8 +79,6 @@ import org.apache.hadoop.yarn.event.Event;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
@ -118,41 +114,14 @@ import org.junit.Before;
import org.junit.Test;
@SuppressWarnings("rawtypes")
public class TestNodeStatusUpdater {
// temp fix until metrics system can auto-detect itself running in unit test:
static {
DefaultMetricsSystem.setMiniClusterMode(true);
}
static final Logger LOG =
LoggerFactory.getLogger(TestNodeStatusUpdater.class);
static final File basedir =
new File("target", TestNodeStatusUpdater.class.getName());
static final File nmLocalDir = new File(basedir, "nm0");
static final File tmpDir = new File(basedir, "tmpDir");
static final File remoteLogsDir = new File(basedir, "remotelogs");
static final File logsDir = new File(basedir, "logs");
private static final RecordFactory recordFactory = RecordFactoryProvider
.getRecordFactory(null);
public class TestNodeStatusUpdater extends NodeManagerTestBase {
volatile int heartBeatID = 0;
volatile Throwable nmStartError = null;
private final List<NodeId> registeredNodes = new ArrayList<NodeId>();
private boolean triggered = false;
private Configuration conf;
private NodeManager nm;
private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
@Before
public void setUp() throws IOException {
nmLocalDir.mkdirs();
tmpDir.mkdirs();
logsDir.mkdirs();
remoteLogsDir.mkdirs();
conf = createNMConfig();
}
@After
public void tearDown() {
this.registeredNodes.clear();
@ -334,29 +303,7 @@ public class TestNodeStatusUpdater {
}
}
private class MyContainerManager extends ContainerManagerImpl {
public boolean signaled = false;
public MyContainerManager(Context context, ContainerExecutor exec,
DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
NodeManagerMetrics metrics,
LocalDirsHandlerService dirsHandler) {
super(context, exec, deletionContext, nodeStatusUpdater,
metrics, dirsHandler);
}
@Override
public void handle(ContainerManagerEvent event) {
if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
signaled = true;
}
}
}
private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl {
public ResourceTracker resourceTracker;
private Context context;
private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
this(context, dispatcher, healthChecker, metrics, false);
@ -365,19 +312,8 @@ public class TestNodeStatusUpdater {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
boolean signalContainer) {
super(context, dispatcher, healthChecker, metrics);
this.context = context;
resourceTracker = new MyResourceTracker(this.context, signalContainer);
}
@Override
protected ResourceTracker getRMClient() {
return resourceTracker;
}
@Override
protected void stopRMProxy() {
return;
super(context, dispatcher, healthChecker, metrics,
new MyResourceTracker(context, signalContainer));
}
}
@ -1820,7 +1756,6 @@ public class TestNodeStatusUpdater {
Assert.assertTrue("Test failed with exception(s)" + exceptions,
exceptions.isEmpty());
}
// Add new containers info into NM context each time node heart beats.
private class MyNMContext extends NMContext {
@ -1924,31 +1859,6 @@ public class TestNodeStatusUpdater {
this.registeredNodes.size());
}
private YarnConfiguration createNMConfig(int port) throws IOException {
YarnConfiguration conf = new YarnConfiguration();
String localhostAddress = null;
try {
localhostAddress = InetAddress.getByName("localhost")
.getCanonicalHostName();
} catch (UnknownHostException e) {
Assert.fail("Unable to get localhost address: " + e.getMessage());
}
conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
+ ServerSocketUtil.getPort(49160, 10));
conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
remoteLogsDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
return conf;
}
private YarnConfiguration createNMConfig() throws IOException {
return createNMConfig(ServerSocketUtil.getPort(49170, 10));
}
private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
return new NodeManager() {
@Override

View File

@ -18,26 +18,6 @@
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
@ -64,6 +44,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
@ -72,17 +53,36 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.Records;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
/**
* Base class for all the AMRMProxyService test cases. It provides utility
@ -773,5 +773,9 @@ public abstract class BaseAMRMProxyTest {
getContainerStateTransitionListener() {
return null;
}
public ResourcePluginManager getResourcePluginManager() {
return null;
}
}
}

View File

@ -22,6 +22,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@ -30,6 +31,8 @@ import org.slf4j.LoggerFactory;
import java.util.List;
import static org.mockito.Mockito.mock;
public class TestResourceHandlerModule {
private static final Logger LOG =
LoggerFactory.getLogger(TestResourceHandlerModule.class);
@ -62,7 +65,7 @@ public class TestResourceHandlerModule {
//Ensure that outbound bandwidth resource handler is present in the chain
ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule
.getConfiguredResourceHandlerChain(networkEnabledConf);
.getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class));
List<ResourceHandler> resourceHandlers = resourceHandlerChain
.getResourceHandlerList();
//Exactly one resource handler in chain
@ -88,7 +91,8 @@ public class TestResourceHandlerModule {
Assert.assertNotNull(handler);
ResourceHandlerChain resourceHandlerChain =
ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf);
ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf,
mock(Context.class));
List<ResourceHandler> resourceHandlers =
resourceHandlerChain.getResourceHandlerList();
// Exactly one resource handler in chain

View File

@ -0,0 +1,382 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyList;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
public class TestGpuResourceHandler {
private CGroupsHandler mockCGroupsHandler;
private PrivilegedOperationExecutor mockPrivilegedExecutor;
private GpuResourceHandlerImpl gpuResourceHandler;
private NMStateStoreService mockNMStateStore;
private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
@Before
public void setup() {
TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
mockCGroupsHandler = mock(CGroupsHandler.class);
mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
mockNMStateStore = mock(NMStateStoreService.class);
Context nmctx = mock(Context.class);
when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
runningContainersMap = new ConcurrentHashMap<>();
when(nmctx.getContainers()).thenReturn(runningContainersMap);
gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
mockPrivilegedExecutor);
}
@Test
public void testBootStrap() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
verify(mockCGroupsHandler, times(1)).initializeCGroupController(
CGroupsHandler.CGroupController.DEVICES);
}
private static ContainerId getContainerId(int id) {
return ContainerId.newContainerId(ApplicationAttemptId
.newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
}
private static Container mockContainerWithGpuRequest(int id,
int numGpuRequest) {
Container c = mock(Container.class);
when(c.getContainerId()).thenReturn(getContainerId(id));
Resource res = Resource.newInstance(1024, 1);
ResourceMappings resMapping = new ResourceMappings();
res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
when(c.getResource()).thenReturn(res);
when(c.getResourceMappings()).thenReturn(resMapping);
return c;
}
private void verifyDeniedDevices(ContainerId containerId,
List<Integer> deniedDevices)
throws ResourceHandlerException, PrivilegedOperationException {
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, containerId.toString());
if (null != deniedDevices && !deniedDevices.isEmpty()) {
verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
.asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
containerId.toString(),
GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
StringUtils.join(",", deniedDevices))), true);
}
}
@Test
public void testAllocation() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
    /* Start container 1, requesting 3 GPUs */
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
// Only device=4 will be blocked.
verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
    /* Start container 2, requesting 2 GPUs. Expected to fail */
boolean failedToAllocate = false;
try {
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
} catch (ResourceHandlerException e) {
failedToAllocate = true;
}
Assert.assertTrue(failedToAllocate);
    /* Start container 3, requesting 1 GPU; expected to succeed */
gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
// devices = 0/1/3 will be blocked
verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
    /* Start container 4, requesting 0 GPUs; expected to succeed */
gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
// All devices will be blocked
verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
/* Release container-1, expect cgroups deleted */
gpuResourceHandler.postComplete(getContainerId(1));
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
Assert.assertEquals(3,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
/* Release container-3, expect cgroups deleted */
gpuResourceHandler.postComplete(getContainerId(3));
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
}
@SuppressWarnings("unchecked")
@Test
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
doThrow(new IOException("Exception ...")).when(mockNMStateStore)
.storeAssignedResources(
any(ContainerId.class), anyString(), anyList());
boolean exception = false;
    /* Start container 1, requesting 3 GPUs */
try {
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
} catch (ResourceHandlerException e) {
exception = true;
}
Assert.assertTrue("preStart should throw exception", exception);
    // After preStart, we still have 4 available GPUs since the store op fails.
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
}
@Test
public void testAllocationWithoutAllowedGpus() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(0,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
    /* Start container 1, requesting 0 GPUs */
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
verifyDeniedDevices(getContainerId(1), Collections.emptyList());
    /* Start container 2, requesting 1 GPU. Expected to fail */
boolean failedToAllocate = false;
try {
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
} catch (ResourceHandlerException e) {
failedToAllocate = true;
}
Assert.assertTrue(failedToAllocate);
/* Release container 1, expect cgroups deleted */
gpuResourceHandler.postComplete(getContainerId(1));
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
Assert.assertEquals(0,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
}
@Test
public void testAllocationStored() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
    /* Start container 1, requesting 3 GPUs */
Container container = mockContainerWithGpuRequest(1, 3);
gpuResourceHandler.preStart(container);
verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
ResourceInformation.GPU_URI,
Arrays.asList("0", "1", "3"));
Assert.assertEquals(3, container.getResourceMappings()
.getAssignedResources(ResourceInformation.GPU_URI).size());
// Only device=4 will be blocked.
verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
    /* Start container 2, requesting 0 GPUs; expected to succeed */
container = mockContainerWithGpuRequest(2, 0);
gpuResourceHandler.preStart(container);
verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
Assert.assertEquals(0, container.getResourceMappings()
.getAssignedResources(ResourceInformation.GPU_URI).size());
// Store assigned resource will not be invoked.
verify(mockNMStateStore, never()).storeAssignedResources(
eq(getContainerId(2)), eq(ResourceInformation.GPU_URI), anyList());
}
@Test
public void testRecoverResourceAllocation() throws Exception {
Configuration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
Assert.assertEquals(4,
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
Container nmContainer = mock(Container.class);
ResourceMappings rmap = new ResourceMappings();
ResourceMappings.AssignedResources ar =
new ResourceMappings.AssignedResources();
ar.updateAssignedResources(Arrays.asList("1", "3"));
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
runningContainersMap.put(getContainerId(1), nmContainer);
// TEST CASE
    // Reacquiring the container restores the state of the GPU Resource Allocator.
gpuResourceHandler.reacquireContainer(getContainerId(1));
Map<Integer, ContainerId> deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
Assert.assertEquals(2, deviceAllocationMapping.size());
Assert.assertTrue(
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
// TEST CASE
    // Try to reacquire a container whose requested device is not in the allowed list.
nmContainer = mock(Container.class);
rmap = new ResourceMappings();
ar = new ResourceMappings.AssignedResources();
// id=5 is not in allowed list.
ar.updateAssignedResources(Arrays.asList("4", "5"));
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
runningContainersMap.put(getContainerId(2), nmContainer);
boolean caughtException = false;
try {
gpuResourceHandler.reacquireContainer(getContainerId(1));
} catch (ResourceHandlerException e) {
caughtException = true;
}
Assert.assertTrue(
"Should fail since requested device Id is not in allowed list",
caughtException);
// Make sure internal state not changed.
deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
Assert.assertEquals(2, deviceAllocationMapping.size());
Assert.assertTrue(
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
// TEST CASE
    // Try to reacquire a container whose requested device is already assigned.
nmContainer = mock(Container.class);
rmap = new ResourceMappings();
ar = new ResourceMappings.AssignedResources();
// id=3 is already assigned
ar.updateAssignedResources(Arrays.asList("4", "3"));
rmap.addAssignedResources("gpu", ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
runningContainersMap.put(getContainerId(2), nmContainer);
caughtException = false;
try {
gpuResourceHandler.reacquireContainer(getContainerId(1));
} catch (ResourceHandlerException e) {
caughtException = true;
}
Assert.assertTrue(
"Should fail since requested device Id is not in allowed list",
caughtException);
// Make sure internal state not changed.
deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
Assert.assertEquals(2, deviceAllocationMapping.size());
Assert.assertTrue(
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
}
}

View File

@ -73,7 +73,7 @@ public class TestContainersMonitorResourceChange {
private static class MockExecutor extends ContainerExecutor {
@Override
public void init() throws IOException {
public void init(Context nmContext) throws IOException {
}
@Override
public void startLocalizer(LocalizerStartContext ctx)

View File

@ -0,0 +1,261 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.ServiceOperations;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.junit.After;
import org.junit.Assert;
import org.junit.Test;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
public class TestResourcePluginManager extends NodeManagerTestBase {
private NodeManager nm;
ResourcePluginManager stubResourcePluginmanager() {
// Stub ResourcePluginManager
final ResourcePluginManager rpm = mock(ResourcePluginManager.class);
Map<String, ResourcePlugin> plugins = new HashMap<>();
// First resource plugin
ResourcePlugin resourcePlugin = mock(ResourcePlugin.class);
NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = mock(
NodeResourceUpdaterPlugin.class);
when(resourcePlugin.getNodeResourceHandlerInstance()).thenReturn(
nodeResourceUpdaterPlugin);
plugins.put("resource1", resourcePlugin);
// Second resource plugin
resourcePlugin = mock(ResourcePlugin.class);
when(resourcePlugin.createResourceHandler(any(Context.class), any(
CGroupsHandler.class), any(PrivilegedOperationExecutor.class)))
.thenReturn(new CustomizedResourceHandler());
plugins.put("resource2", resourcePlugin);
when(rpm.getNameToPlugins()).thenReturn(plugins);
return rpm;
}
@After
public void tearDown() {
if (nm != null) {
try {
ServiceOperations.stop(nm);
} catch (Throwable t) {
// ignore
}
}
}
private class CustomizedResourceHandler implements ResourceHandler {
@Override
public List<PrivilegedOperation> bootstrap(Configuration configuration)
throws ResourceHandlerException {
return null;
}
@Override
public List<PrivilegedOperation> preStart(Container container)
throws ResourceHandlerException {
return null;
}
@Override
public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
throws ResourceHandlerException {
return null;
}
@Override
public List<PrivilegedOperation> postComplete(ContainerId containerId)
throws ResourceHandlerException {
return null;
}
@Override
public List<PrivilegedOperation> teardown()
throws ResourceHandlerException {
return null;
}
}
private class MyMockNM extends NodeManager {
private final ResourcePluginManager rpm;
public MyMockNM(ResourcePluginManager rpm) {
this.rpm = rpm;
}
@Override
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
((NodeManager.NMContext)context).setResourcePluginManager(rpm);
return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
metrics, new BaseResourceTrackerForTest());
}
@Override
protected ContainerManagerImpl createContainerManager(Context context,
ContainerExecutor exec, DeletionService del,
NodeStatusUpdater nodeStatusUpdater,
ApplicationACLsManager aclsManager,
LocalDirsHandlerService diskhandler) {
return new MyContainerManager(context, exec, del, nodeStatusUpdater,
metrics, diskhandler);
}
@Override
protected ResourcePluginManager createResourcePluginManager() {
return rpm;
}
}
public class MyLCE extends LinuxContainerExecutor {
private PrivilegedOperationExecutor poe = mock(PrivilegedOperationExecutor.class);
@Override
protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
return poe;
}
}
/*
* Make sure ResourcePluginManager is initialized during NM start up.
*/
@Test(timeout = 30000)
public void testResourcePluginManagerInitialization() throws Exception {
final ResourcePluginManager rpm = stubResourcePluginManager();
nm = new MyMockNM(rpm);
YarnConfiguration conf = createNMConfig();
nm.init(conf);
verify(rpm, times(1)).initialize(
any(Context.class));
}
/*
* Make sure each plugin's NodeResourceUpdaterPlugin is invoked when the NM
* updates its node resource.
*/
@Test(timeout = 30000)
public void testNodeStatusUpdaterWithResourcePluginsEnabled() throws Exception {
final ResourcePluginManager rpm = stubResourcePluginManager();
nm = new MyMockNM(rpm);
YarnConfiguration conf = createNMConfig();
nm.init(conf);
nm.start();
NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin =
rpm.getNameToPlugins().get("resource1")
.getNodeResourceHandlerInstance();
verify(nodeResourceUpdaterPlugin, times(1)).updateConfiguredResource(
any(Resource.class));
}
/*
* Make sure ResourcePluginManager is used to initialize the
* LinuxContainerExecutor's ResourceHandlerChain.
*/
@Test(timeout = 30000)
public void testLinuxContainerExecutorWithResourcePluginsEnabled() throws Exception {
final ResourcePluginManager rpm = stubResourcePluginManager();
final LinuxContainerExecutor lce = new MyLCE();
nm = new NodeManager() {
@Override
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
((NMContext)context).setResourcePluginManager(rpm);
return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
metrics, new BaseResourceTrackerForTest());
}
@Override
protected ContainerManagerImpl createContainerManager(Context context,
ContainerExecutor exec, DeletionService del,
NodeStatusUpdater nodeStatusUpdater,
ApplicationACLsManager aclsManager,
LocalDirsHandlerService diskhandler) {
return new MyContainerManager(context, exec, del, nodeStatusUpdater,
metrics, diskhandler);
}
@Override
protected ContainerExecutor createContainerExecutor(Configuration conf) {
((NMContext)this.getNMContext()).setResourcePluginManager(rpm);
lce.setConf(conf);
return lce;
}
};
YarnConfiguration conf = createNMConfig();
nm.init(conf);
nm.start();
ResourceHandler handler = lce.getResourceHandler();
Assert.assertNotNull(handler);
Assert.assertTrue(handler instanceof ResourceHandlerChain);
boolean newHandlerAdded = false;
for (ResourceHandler h : ((ResourceHandlerChain) handler)
.getResourceHandlerList()) {
if (h instanceof CustomizedResourceHandler) {
newHandlerAdded = true;
break;
}
}
Assert.assertTrue("New ResourceHandler should be added", newHandlerAdded);
}
}
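As a companion to the test above, here is a minimal sketch of the configuration side of enabling the plugin machinery on a real NodeManager. The class name and the "0,1" device list are purely illustrative, and treating ResourceInformation.GPU_URI ("yarn.io/gpu") as the name ResourcePluginManager matches plugins on is an assumption; only the YarnConfiguration keys themselves come from this patch.

import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

/** Hypothetical example, not part of this patch. */
public final class GpuPluginConfExample {
  public static YarnConfiguration build() {
    YarnConfiguration conf = new YarnConfiguration();
    // Enable the GPU plugin; the plugin name is assumed to be its resource URI.
    conf.set(YarnConfiguration.NM_RESOURCE_PLUGINS, ResourceInformation.GPU_URI);
    // Optionally pin YARN to specific device minor numbers instead of letting
    // GpuDiscoverer auto-discover them via nvidia-smi.
    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1");
    return conf;
  }
}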

View File

@ -0,0 +1,123 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
public class TestGpuDiscoverer {
private String getTestParentFolder() {
File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
return f.getAbsolutePath();
}
private void touchFile(File f) throws IOException {
new FileOutputStream(f).close();
}
@Before
public void before() throws IOException {
String folder = getTestParentFolder();
File f = new File(folder);
FileUtils.deleteDirectory(f);
f.mkdirs();
}
@Test
public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
// Only run this test on demand
// (-DRunLinuxGpuResourceDiscoverPluginConfigTest=true).
Assume.assumeTrue(Boolean.valueOf(
System.getProperty("RunLinuxGpuResourceDiscoverPluginConfigTest")));
// Test case 1: nothing configured, so the default binary name is used and
// PATH includes the usual nvidia locations.
Configuration conf = new Configuration(false);
GpuDiscoverer plugin = new GpuDiscoverer();
plugin.initialize(conf);
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
plugin.getPathOfGpuBinary());
Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
Assert.assertTrue(
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
// Test case 2: an explicitly configured path is used and PATH is left unset.
File fakeBinary = new File(getTestParentFolder(),
GpuDiscoverer.DEFAULT_BINARY_NAME);
touchFile(fakeBinary);
conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
plugin = new GpuDiscoverer();
plugin.initialize(conf);
Assert.assertEquals(fakeBinary.getAbsolutePath(),
plugin.getPathOfGpuBinary());
Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
// Test case 3: a path is configured but the binary doesn't exist there, so
// the discoverer falls back to the default binary name.
fakeBinary.delete();
plugin = new GpuDiscoverer();
plugin.initialize(conf);
Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
plugin.getPathOfGpuBinary());
Assert.assertTrue(
plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
}
@Test
public void testGpuDiscover() throws YarnException {
// This test needs a node with NVIDIA GPUs and nvidia-smi available, so only
// run it on demand (-DrunGpuDiscoverUnitTest=true).
Assume.assumeTrue(
Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
Configuration conf = new Configuration(false);
GpuDiscoverer plugin = new GpuDiscoverer();
plugin.initialize(conf);
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
Assert.assertTrue(info.getGpus().size() > 0);
Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
info.getGpus().size());
}
@Test
public void getNumberOfUsableGpusFromConfig() throws YarnException {
Configuration conf = new Configuration(false);
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
GpuDiscoverer plugin = new GpuDiscoverer();
plugin.initialize(conf);
List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
Assert.assertEquals(4, minorNumbers.size());
Assert.assertEquals(0, minorNumbers.get(0).intValue());
Assert.assertEquals(1, minorNumbers.get(1).intValue());
Assert.assertEquals(2, minorNumbers.get(2).intValue());
Assert.assertEquals(4, minorNumbers.get(3).intValue());
}
}
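The discoverer API exercised above can also be driven standalone; a minimal sketch follows, assuming it sits in GpuDiscoverer's package and runs on a host where nvidia-smi lives under the illustrative /usr/local/nvidia/bin path. Only initialize(), getGpuDeviceInformation() and getMinorNumbersOfGpusUsableByYarn() are taken from the code above.

package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;

/** Hypothetical driver, not part of this patch. */
public final class GpuDiscovererExample {
  public static void main(String[] args) throws YarnException {
    Configuration conf = new Configuration(false);
    // Illustrative location of nvidia-smi; leave unset to rely on PATH lookup.
    conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/usr/local/nvidia/bin");

    GpuDiscoverer discoverer = new GpuDiscoverer();
    discoverer.initialize(conf);

    GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
    List<Integer> usable = discoverer.getMinorNumbersOfGpusUsableByYarn();
    System.out.println("Driver " + info.getDriverVersion()
        + ", GPUs usable by YARN (minor numbers): " + usable);
  }
}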

View File

@ -0,0 +1,50 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
public class TestGpuDeviceInformationParser {
@Test
public void testParse() throws IOException, YarnException {
File f = new File("src/test/resources/nvidia-smi-sample-xml-output");
String s = FileUtils.readFileToString(f, "UTF-8");
GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
GpuDeviceInformation info = parser.parseXml(s);
Assert.assertEquals("375.66", info.getDriverVersion());
Assert.assertEquals(2, info.getGpus().size());
PerGpuDeviceInformation gpu1 = info.getGpus().get(1);
Assert.assertEquals("Tesla P100-PCIE-12GB", gpu1.getProductName());
Assert.assertEquals(16384, gpu1.getGpuMemoryUsage().getTotalMemoryMiB());
Assert.assertEquals(10.3f,
gpu1.getGpuUtilizations().getOverallGpuUtilization(), 1e-6);
Assert.assertEquals(34f, gpu1.getTemperature().getCurrentGpuTemp(), 1e-6);
Assert.assertEquals(85f, gpu1.getTemperature().getMaxGpuTemp(), 1e-6);
Assert.assertEquals(82f, gpu1.getTemperature().getSlowThresholdGpuTemp(),
1e-6);
}
}
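The parser above is fed a bundled sample file; a hedged sketch of running it against live output follows. The "nvidia-smi -q -x" invocation is standard NVIDIA tooling (not part of this patch) and the wrapper class is illustrative; parseXml(), getDriverVersion() and getGpus() are the methods exercised by the test.

package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.yarn.exceptions.YarnException;

/** Hypothetical driver, not part of this patch. */
public final class ParseLiveNvidiaSmiOutput {
  public static void main(String[] args) throws IOException, YarnException {
    // Ask nvidia-smi for its XML report, the same format as the sample below.
    Process p = new ProcessBuilder("nvidia-smi", "-q", "-x").start();
    String xml = IOUtils.toString(p.getInputStream(), StandardCharsets.UTF_8);

    GpuDeviceInformation info = new GpuDeviceInformationParser().parseXml(xml);
    System.out.println("Driver " + info.getDriverVersion() + ", "
        + info.getGpus().size() + " GPU(s) attached");
  }
}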

View File

@ -0,0 +1,547 @@
<?xml version="1.0" ?>
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<nvidia_smi_log>
<timestamp>Wed Sep 6 21:52:51 2017</timestamp>
<driver_version>375.66</driver_version>
<attached_gpus>2</attached_gpus>
<gpu id="0000:04:00.0">
<product_name>Tesla P100-PCIE-12GB</product_name>
<product_brand>Tesla</product_brand>
<display_mode>Disabled</display_mode>
<display_active>Disabled</display_active>
<persistence_mode>Disabled</persistence_mode>
<accounting_mode>Disabled</accounting_mode>
<accounting_mode_buffer_size>1920</accounting_mode_buffer_size>
<driver_model>
<current_dm>N/A</current_dm>
<pending_dm>N/A</pending_dm>
</driver_model>
<serial>0320717030197</serial>
<uuid>GPU-28604e81-21ec-cc48-6759-bf2648b22e16</uuid>
<minor_number>0</minor_number>
<vbios_version>86.00.3A.00.02</vbios_version>
<multigpu_board>No</multigpu_board>
<board_id>0x400</board_id>
<gpu_part_number>900-2H400-0110-030</gpu_part_number>
<inforom_version>
<img_version>H400.0202.00.01</img_version>
<oem_object>1.1</oem_object>
<ecc_object>4.1</ecc_object>
<pwr_object>N/A</pwr_object>
</inforom_version>
<gpu_operation_mode>
<current_gom>N/A</current_gom>
<pending_gom>N/A</pending_gom>
</gpu_operation_mode>
<gpu_virtualization_mode>
<virtualization_mode>None</virtualization_mode>
</gpu_virtualization_mode>
<pci>
<pci_bus>04</pci_bus>
<pci_device>00</pci_device>
<pci_domain>0000</pci_domain>
<pci_device_id>15F710DE</pci_device_id>
<pci_bus_id>0000:04:00.0</pci_bus_id>
<pci_sub_system_id>11DA10DE</pci_sub_system_id>
<pci_gpu_link_info>
<pcie_gen>
<max_link_gen>3</max_link_gen>
<current_link_gen>3</current_link_gen>
</pcie_gen>
<link_widths>
<max_link_width>16x</max_link_width>
<current_link_width>16x</current_link_width>
</link_widths>
</pci_gpu_link_info>
<pci_bridge_chip>
<bridge_chip_type>N/A</bridge_chip_type>
<bridge_chip_fw>N/A</bridge_chip_fw>
</pci_bridge_chip>
<replay_counter>0</replay_counter>
<tx_util>0 KB/s</tx_util>
<rx_util>0 KB/s</rx_util>
</pci>
<fan_speed>N/A</fan_speed>
<performance_state>P0</performance_state>
<clocks_throttle_reasons>
<clocks_throttle_reason_gpu_idle>Active</clocks_throttle_reason_gpu_idle>
<clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
<clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
<clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
<clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
<clocks_throttle_reason_unknown>Not Active</clocks_throttle_reason_unknown>
</clocks_throttle_reasons>
<fb_memory_usage>
<total>12193 MiB</total>
<used>0 MiB</used>
<free>12193 MiB</free>
</fb_memory_usage>
<bar1_memory_usage>
<total>16384 MiB</total>
<used>2 MiB</used>
<free>16382 MiB</free>
</bar1_memory_usage>
<compute_mode>Default</compute_mode>
<utilization>
<gpu_util>0 %</gpu_util>
<memory_util>0 %</memory_util>
<encoder_util>0 %</encoder_util>
<decoder_util>0 %</decoder_util>
</utilization>
<encoder_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0 ms</average_latency>
</encoder_stats>
<ecc_mode>
<current_ecc>Enabled</current_ecc>
<pending_ecc>Enabled</pending_ecc>
</ecc_mode>
<ecc_errors>
<volatile>
<single_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</single_bit>
<double_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</double_bit>
</volatile>
<aggregate>
<single_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</single_bit>
<double_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</double_bit>
</aggregate>
</ecc_errors>
<retired_pages>
<multiple_single_bit_retirement>
<retired_count>0</retired_count>
<retired_page_addresses>
</retired_page_addresses>
</multiple_single_bit_retirement>
<double_bit_retirement>
<retired_count>0</retired_count>
<retired_page_addresses>
</retired_page_addresses>
</double_bit_retirement>
<pending_retirement>No</pending_retirement>
</retired_pages>
<temperature>
<gpu_temp>31 C</gpu_temp>
<gpu_temp_max_threshold>85 C</gpu_temp_max_threshold>
<gpu_temp_slow_threshold>82 C</gpu_temp_slow_threshold>
</temperature>
<power_readings>
<power_state>P0</power_state>
<power_management>Supported</power_management>
<power_draw>24.84 W</power_draw>
<power_limit>250.00 W</power_limit>
<default_power_limit>250.00 W</default_power_limit>
<enforced_power_limit>250.00 W</enforced_power_limit>
<min_power_limit>125.00 W</min_power_limit>
<max_power_limit>250.00 W</max_power_limit>
</power_readings>
<clocks>
<graphics_clock>405 MHz</graphics_clock>
<sm_clock>405 MHz</sm_clock>
<mem_clock>715 MHz</mem_clock>
<video_clock>835 MHz</video_clock>
</clocks>
<applications_clocks>
<graphics_clock>1189 MHz</graphics_clock>
<mem_clock>715 MHz</mem_clock>
</applications_clocks>
<default_applications_clocks>
<graphics_clock>1189 MHz</graphics_clock>
<mem_clock>715 MHz</mem_clock>
</default_applications_clocks>
<max_clocks>
<graphics_clock>1328 MHz</graphics_clock>
<sm_clock>1328 MHz</sm_clock>
<mem_clock>715 MHz</mem_clock>
<video_clock>1328 MHz</video_clock>
</max_clocks>
<clock_policy>
<auto_boost>N/A</auto_boost>
<auto_boost_default>N/A</auto_boost_default>
</clock_policy>
<supported_clocks>
<supported_mem_clock>
<value>715 MHz</value>
<supported_graphics_clock>1328 MHz</supported_graphics_clock>
<supported_graphics_clock>1316 MHz</supported_graphics_clock>
<supported_graphics_clock>1303 MHz</supported_graphics_clock>
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
<supported_graphics_clock>1278 MHz</supported_graphics_clock>
<supported_graphics_clock>1265 MHz</supported_graphics_clock>
<supported_graphics_clock>1252 MHz</supported_graphics_clock>
<supported_graphics_clock>1240 MHz</supported_graphics_clock>
<supported_graphics_clock>1227 MHz</supported_graphics_clock>
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
<supported_graphics_clock>1202 MHz</supported_graphics_clock>
<supported_graphics_clock>1189 MHz</supported_graphics_clock>
<supported_graphics_clock>1177 MHz</supported_graphics_clock>
<supported_graphics_clock>1164 MHz</supported_graphics_clock>
<supported_graphics_clock>1151 MHz</supported_graphics_clock>
<supported_graphics_clock>1139 MHz</supported_graphics_clock>
<supported_graphics_clock>1126 MHz</supported_graphics_clock>
<supported_graphics_clock>1113 MHz</supported_graphics_clock>
<supported_graphics_clock>1101 MHz</supported_graphics_clock>
<supported_graphics_clock>1088 MHz</supported_graphics_clock>
<supported_graphics_clock>1075 MHz</supported_graphics_clock>
<supported_graphics_clock>1063 MHz</supported_graphics_clock>
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
<supported_graphics_clock>1037 MHz</supported_graphics_clock>
<supported_graphics_clock>1025 MHz</supported_graphics_clock>
<supported_graphics_clock>1012 MHz</supported_graphics_clock>
<supported_graphics_clock>999 MHz</supported_graphics_clock>
<supported_graphics_clock>987 MHz</supported_graphics_clock>
<supported_graphics_clock>974 MHz</supported_graphics_clock>
<supported_graphics_clock>961 MHz</supported_graphics_clock>
<supported_graphics_clock>949 MHz</supported_graphics_clock>
<supported_graphics_clock>936 MHz</supported_graphics_clock>
<supported_graphics_clock>923 MHz</supported_graphics_clock>
<supported_graphics_clock>911 MHz</supported_graphics_clock>
<supported_graphics_clock>898 MHz</supported_graphics_clock>
<supported_graphics_clock>885 MHz</supported_graphics_clock>
<supported_graphics_clock>873 MHz</supported_graphics_clock>
<supported_graphics_clock>860 MHz</supported_graphics_clock>
<supported_graphics_clock>847 MHz</supported_graphics_clock>
<supported_graphics_clock>835 MHz</supported_graphics_clock>
<supported_graphics_clock>822 MHz</supported_graphics_clock>
<supported_graphics_clock>810 MHz</supported_graphics_clock>
<supported_graphics_clock>797 MHz</supported_graphics_clock>
<supported_graphics_clock>784 MHz</supported_graphics_clock>
<supported_graphics_clock>772 MHz</supported_graphics_clock>
<supported_graphics_clock>759 MHz</supported_graphics_clock>
<supported_graphics_clock>746 MHz</supported_graphics_clock>
<supported_graphics_clock>734 MHz</supported_graphics_clock>
<supported_graphics_clock>721 MHz</supported_graphics_clock>
<supported_graphics_clock>708 MHz</supported_graphics_clock>
<supported_graphics_clock>696 MHz</supported_graphics_clock>
<supported_graphics_clock>683 MHz</supported_graphics_clock>
<supported_graphics_clock>670 MHz</supported_graphics_clock>
<supported_graphics_clock>658 MHz</supported_graphics_clock>
<supported_graphics_clock>645 MHz</supported_graphics_clock>
<supported_graphics_clock>632 MHz</supported_graphics_clock>
<supported_graphics_clock>620 MHz</supported_graphics_clock>
<supported_graphics_clock>607 MHz</supported_graphics_clock>
<supported_graphics_clock>594 MHz</supported_graphics_clock>
<supported_graphics_clock>582 MHz</supported_graphics_clock>
<supported_graphics_clock>569 MHz</supported_graphics_clock>
<supported_graphics_clock>556 MHz</supported_graphics_clock>
<supported_graphics_clock>544 MHz</supported_graphics_clock>
</supported_mem_clock>
</supported_clocks>
<processes>
</processes>
<accounted_processes>
</accounted_processes>
</gpu>
<gpu id="0000:82:00.0">
<product_name>Tesla P100-PCIE-12GB</product_name>
<product_brand>Tesla</product_brand>
<display_mode>Disabled</display_mode>
<display_active>Disabled</display_active>
<persistence_mode>Disabled</persistence_mode>
<accounting_mode>Disabled</accounting_mode>
<accounting_mode_buffer_size>1920</accounting_mode_buffer_size>
<driver_model>
<current_dm>N/A</current_dm>
<pending_dm>N/A</pending_dm>
</driver_model>
<serial>0320717031755</serial>
<uuid>GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3</uuid>
<minor_number>1</minor_number>
<vbios_version>86.00.3A.00.02</vbios_version>
<multigpu_board>No</multigpu_board>
<board_id>0x8200</board_id>
<gpu_part_number>900-2H400-0110-030</gpu_part_number>
<inforom_version>
<img_version>H400.0202.00.01</img_version>
<oem_object>1.1</oem_object>
<ecc_object>4.1</ecc_object>
<pwr_object>N/A</pwr_object>
</inforom_version>
<gpu_operation_mode>
<current_gom>N/A</current_gom>
<pending_gom>N/A</pending_gom>
</gpu_operation_mode>
<gpu_virtualization_mode>
<virtualization_mode>None</virtualization_mode>
</gpu_virtualization_mode>
<pci>
<pci_bus>82</pci_bus>
<pci_device>00</pci_device>
<pci_domain>0000</pci_domain>
<pci_device_id>15F710DE</pci_device_id>
<pci_bus_id>0000:82:00.0</pci_bus_id>
<pci_sub_system_id>11DA10DE</pci_sub_system_id>
<pci_gpu_link_info>
<pcie_gen>
<max_link_gen>3</max_link_gen>
<current_link_gen>3</current_link_gen>
</pcie_gen>
<link_widths>
<max_link_width>16x</max_link_width>
<current_link_width>16x</current_link_width>
</link_widths>
</pci_gpu_link_info>
<pci_bridge_chip>
<bridge_chip_type>N/A</bridge_chip_type>
<bridge_chip_fw>N/A</bridge_chip_fw>
</pci_bridge_chip>
<replay_counter>0</replay_counter>
<tx_util>0 KB/s</tx_util>
<rx_util>0 KB/s</rx_util>
</pci>
<fan_speed>N/A</fan_speed>
<performance_state>P0</performance_state>
<clocks_throttle_reasons>
<clocks_throttle_reason_gpu_idle>Active</clocks_throttle_reason_gpu_idle>
<clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
<clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
<clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
<clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
<clocks_throttle_reason_unknown>Not Active</clocks_throttle_reason_unknown>
</clocks_throttle_reasons>
<fb_memory_usage>
<total>12193 MiB</total>
<used>0 MiB</used>
<free>12193 MiB</free>
</fb_memory_usage>
<bar1_memory_usage>
<total>16384 MiB</total>
<used>2 MiB</used>
<free>16382 MiB</free>
</bar1_memory_usage>
<compute_mode>Default</compute_mode>
<utilization>
<gpu_util>10.3 %</gpu_util>
<memory_util>0 %</memory_util>
<encoder_util>0 %</encoder_util>
<decoder_util>0 %</decoder_util>
</utilization>
<encoder_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0 ms</average_latency>
</encoder_stats>
<ecc_mode>
<current_ecc>Enabled</current_ecc>
<pending_ecc>Enabled</pending_ecc>
</ecc_mode>
<ecc_errors>
<volatile>
<single_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</single_bit>
<double_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</double_bit>
</volatile>
<aggregate>
<single_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</single_bit>
<double_bit>
<device_memory>0</device_memory>
<register_file>0</register_file>
<l1_cache>N/A</l1_cache>
<l2_cache>0</l2_cache>
<texture_memory>0</texture_memory>
<texture_shm>0</texture_shm>
<total>0</total>
</double_bit>
</aggregate>
</ecc_errors>
<retired_pages>
<multiple_single_bit_retirement>
<retired_count>0</retired_count>
<retired_page_addresses>
</retired_page_addresses>
</multiple_single_bit_retirement>
<double_bit_retirement>
<retired_count>0</retired_count>
<retired_page_addresses>
</retired_page_addresses>
</double_bit_retirement>
<pending_retirement>No</pending_retirement>
</retired_pages>
<temperature>
<gpu_temp>34 C</gpu_temp>
<gpu_temp_max_threshold>85 C</gpu_temp_max_threshold>
<gpu_temp_slow_threshold>82 C</gpu_temp_slow_threshold>
</temperature>
<power_readings>
<power_state>P0</power_state>
<power_management>Supported</power_management>
<power_draw>25.54 W</power_draw>
<power_limit>250.00 W</power_limit>
<default_power_limit>250.00 W</default_power_limit>
<enforced_power_limit>250.00 W</enforced_power_limit>
<min_power_limit>125.00 W</min_power_limit>
<max_power_limit>250.00 W</max_power_limit>
</power_readings>
<clocks>
<graphics_clock>405 MHz</graphics_clock>
<sm_clock>405 MHz</sm_clock>
<mem_clock>715 MHz</mem_clock>
<video_clock>835 MHz</video_clock>
</clocks>
<applications_clocks>
<graphics_clock>1189 MHz</graphics_clock>
<mem_clock>715 MHz</mem_clock>
</applications_clocks>
<default_applications_clocks>
<graphics_clock>1189 MHz</graphics_clock>
<mem_clock>715 MHz</mem_clock>
</default_applications_clocks>
<max_clocks>
<graphics_clock>1328 MHz</graphics_clock>
<sm_clock>1328 MHz</sm_clock>
<mem_clock>715 MHz</mem_clock>
<video_clock>1328 MHz</video_clock>
</max_clocks>
<clock_policy>
<auto_boost>N/A</auto_boost>
<auto_boost_default>N/A</auto_boost_default>
</clock_policy>
<supported_clocks>
<supported_mem_clock>
<value>715 MHz</value>
<supported_graphics_clock>1328 MHz</supported_graphics_clock>
<supported_graphics_clock>1316 MHz</supported_graphics_clock>
<supported_graphics_clock>1303 MHz</supported_graphics_clock>
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
<supported_graphics_clock>1278 MHz</supported_graphics_clock>
<supported_graphics_clock>1265 MHz</supported_graphics_clock>
<supported_graphics_clock>1252 MHz</supported_graphics_clock>
<supported_graphics_clock>1240 MHz</supported_graphics_clock>
<supported_graphics_clock>1227 MHz</supported_graphics_clock>
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
<supported_graphics_clock>1202 MHz</supported_graphics_clock>
<supported_graphics_clock>1189 MHz</supported_graphics_clock>
<supported_graphics_clock>1177 MHz</supported_graphics_clock>
<supported_graphics_clock>1164 MHz</supported_graphics_clock>
<supported_graphics_clock>1151 MHz</supported_graphics_clock>
<supported_graphics_clock>1139 MHz</supported_graphics_clock>
<supported_graphics_clock>1126 MHz</supported_graphics_clock>
<supported_graphics_clock>1113 MHz</supported_graphics_clock>
<supported_graphics_clock>1101 MHz</supported_graphics_clock>
<supported_graphics_clock>1088 MHz</supported_graphics_clock>
<supported_graphics_clock>1075 MHz</supported_graphics_clock>
<supported_graphics_clock>1063 MHz</supported_graphics_clock>
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
<supported_graphics_clock>1037 MHz</supported_graphics_clock>
<supported_graphics_clock>1025 MHz</supported_graphics_clock>
<supported_graphics_clock>1012 MHz</supported_graphics_clock>
<supported_graphics_clock>999 MHz</supported_graphics_clock>
<supported_graphics_clock>987 MHz</supported_graphics_clock>
<supported_graphics_clock>974 MHz</supported_graphics_clock>
<supported_graphics_clock>961 MHz</supported_graphics_clock>
<supported_graphics_clock>949 MHz</supported_graphics_clock>
<supported_graphics_clock>936 MHz</supported_graphics_clock>
<supported_graphics_clock>923 MHz</supported_graphics_clock>
<supported_graphics_clock>911 MHz</supported_graphics_clock>
<supported_graphics_clock>898 MHz</supported_graphics_clock>
<supported_graphics_clock>885 MHz</supported_graphics_clock>
<supported_graphics_clock>873 MHz</supported_graphics_clock>
<supported_graphics_clock>860 MHz</supported_graphics_clock>
<supported_graphics_clock>847 MHz</supported_graphics_clock>
<supported_graphics_clock>835 MHz</supported_graphics_clock>
<supported_graphics_clock>822 MHz</supported_graphics_clock>
<supported_graphics_clock>810 MHz</supported_graphics_clock>
<supported_graphics_clock>797 MHz</supported_graphics_clock>
<supported_graphics_clock>784 MHz</supported_graphics_clock>
<supported_graphics_clock>772 MHz</supported_graphics_clock>
<supported_graphics_clock>759 MHz</supported_graphics_clock>
<supported_graphics_clock>746 MHz</supported_graphics_clock>
<supported_graphics_clock>734 MHz</supported_graphics_clock>
<supported_graphics_clock>721 MHz</supported_graphics_clock>
<supported_graphics_clock>708 MHz</supported_graphics_clock>
<supported_graphics_clock>696 MHz</supported_graphics_clock>
<supported_graphics_clock>683 MHz</supported_graphics_clock>
<supported_graphics_clock>670 MHz</supported_graphics_clock>
<supported_graphics_clock>658 MHz</supported_graphics_clock>
<supported_graphics_clock>645 MHz</supported_graphics_clock>
<supported_graphics_clock>632 MHz</supported_graphics_clock>
<supported_graphics_clock>620 MHz</supported_graphics_clock>
<supported_graphics_clock>607 MHz</supported_graphics_clock>
<supported_graphics_clock>594 MHz</supported_graphics_clock>
<supported_graphics_clock>582 MHz</supported_graphics_clock>
<supported_graphics_clock>569 MHz</supported_graphics_clock>
<supported_graphics_clock>556 MHz</supported_graphics_clock>
<supported_graphics_clock>544 MHz</supported_graphics_clock>
</supported_mem_clock>
</supported_clocks>
<processes>
</processes>
<accounted_processes>
</accounted_processes>
</gpu>
</nvidia_smi_log>