YARN-9174. Backport YARN-7224 for refactoring of GpuDevice class
This commit is contained in:
parent
7ec4d7c6ce
commit
16faceb0da
|
@ -26,12 +26,11 @@ import org.apache.commons.logging.LogFactory;
|
|||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
@ -54,8 +53,8 @@ import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
|
|||
public class GpuResourceAllocator {
|
||||
final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
|
||||
|
||||
private Set<Integer> allowedGpuDevices = new TreeSet<>();
|
||||
private Map<Integer, ContainerId> usedDevices = new TreeMap<>();
|
||||
private Set<GpuDevice> allowedGpuDevices = new TreeSet<>();
|
||||
private Map<GpuDevice, ContainerId> usedDevices = new TreeMap<>();
|
||||
private Context nmContext;
|
||||
|
||||
public GpuResourceAllocator(Context ctx) {
|
||||
|
@ -63,14 +62,14 @@ public class GpuResourceAllocator {
|
|||
}
|
||||
|
||||
/**
|
||||
* Contains allowed and denied devices with minor number.
|
||||
* Contains allowed and denied devices
|
||||
* Denied devices will be useful for cgroups devices module to do blacklisting
|
||||
*/
|
||||
static class GpuAllocation {
|
||||
private Set<Integer> allowed = Collections.emptySet();
|
||||
private Set<Integer> denied = Collections.emptySet();
|
||||
private Set<GpuDevice> allowed = Collections.emptySet();
|
||||
private Set<GpuDevice> denied = Collections.emptySet();
|
||||
|
||||
GpuAllocation(Set<Integer> allowed, Set<Integer> denied) {
|
||||
GpuAllocation(Set<GpuDevice> allowed, Set<GpuDevice> denied) {
|
||||
if (allowed != null) {
|
||||
this.allowed = ImmutableSet.copyOf(allowed);
|
||||
}
|
||||
|
@ -79,21 +78,21 @@ public class GpuResourceAllocator {
|
|||
}
|
||||
}
|
||||
|
||||
public Set<Integer> getAllowedGPUs() {
|
||||
public Set<GpuDevice> getAllowedGPUs() {
|
||||
return allowed;
|
||||
}
|
||||
|
||||
public Set<Integer> getDeniedGPUs() {
|
||||
public Set<GpuDevice> getDeniedGPUs() {
|
||||
return denied;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add GPU to allowed list
|
||||
* @param minorNumber minor number of the GPU device.
|
||||
* @param gpuDevice gpu device
|
||||
*/
|
||||
public synchronized void addGpu(int minorNumber) {
|
||||
allowedGpuDevices.add(minorNumber);
|
||||
public synchronized void addGpu(GpuDevice gpuDevice) {
|
||||
allowedGpuDevices.add(gpuDevice);
|
||||
}
|
||||
|
||||
private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices,
|
||||
|
@ -117,42 +116,42 @@ public class GpuResourceAllocator {
|
|||
+ containerId);
|
||||
}
|
||||
|
||||
for (Serializable deviceId : c.getResourceMappings().getAssignedResources(
|
||||
GPU_URI)){
|
||||
if (!(deviceId instanceof String)) {
|
||||
for (Serializable gpuDeviceSerializable : c.getResourceMappings()
|
||||
.getAssignedResources(GPU_URI)) {
|
||||
if (!(gpuDeviceSerializable instanceof GpuDevice)) {
|
||||
throw new ResourceHandlerException(
|
||||
"Trying to recover device id, however it"
|
||||
+ " is not String, this shouldn't happen");
|
||||
+ " is not GpuDevice, this shouldn't happen");
|
||||
}
|
||||
|
||||
|
||||
int devId;
|
||||
try {
|
||||
devId = Integer.parseInt((String)deviceId);
|
||||
} catch (NumberFormatException e) {
|
||||
throw new ResourceHandlerException("Failed to recover device id because"
|
||||
+ "it is not a valid integer, devId:" + deviceId);
|
||||
}
|
||||
GpuDevice gpuDevice = (GpuDevice) gpuDeviceSerializable;
|
||||
|
||||
// Make sure it is in allowed GPU device.
|
||||
if (!allowedGpuDevices.contains(devId)) {
|
||||
throw new ResourceHandlerException("Try to recover device id = " + devId
|
||||
if (!allowedGpuDevices.contains(gpuDevice)) {
|
||||
throw new ResourceHandlerException(
|
||||
"Try to recover device = " + gpuDevice
|
||||
+ " however it is not in allowed device list:" + StringUtils
|
||||
.join(",", allowedGpuDevices));
|
||||
}
|
||||
|
||||
// Make sure it is not occupied by anybody else
|
||||
if (usedDevices.containsKey(devId)) {
|
||||
throw new ResourceHandlerException("Try to recover device id = " + devId
|
||||
if (usedDevices.containsKey(gpuDevice)) {
|
||||
throw new ResourceHandlerException(
|
||||
"Try to recover device id = " + gpuDevice
|
||||
+ " however it is already assigned to container=" + usedDevices
|
||||
.get(devId) + ", please double check what happened.");
|
||||
.get(gpuDevice) + ", please double check what happened.");
|
||||
}
|
||||
|
||||
usedDevices.put(devId, containerId);
|
||||
usedDevices.put(gpuDevice, containerId);
|
||||
}
|
||||
}
|
||||
|
||||
private int getRequestedGpus(Resource requestedResource) {
|
||||
/**
|
||||
* Get number of requested GPUs from resource.
|
||||
* @param requestedResource requested resource
|
||||
* @return #gpus.
|
||||
*/
|
||||
public static int getRequestedGpus(Resource requestedResource) {
|
||||
try {
|
||||
return Long.valueOf(requestedResource.getResourceValue(
|
||||
GPU_URI)).intValue();
|
||||
|
@ -164,8 +163,8 @@ public class GpuResourceAllocator {
|
|||
/**
|
||||
* Assign GPU to requestor
|
||||
* @param container container to allocate
|
||||
* @return List of denied Gpus with minor numbers
|
||||
* @throws ResourceHandlerException When failed to
|
||||
* @return allocation results.
|
||||
* @throws ResourceHandlerException When failed to assign GPUs.
|
||||
*/
|
||||
public synchronized GpuAllocation assignGpus(Container container)
|
||||
throws ResourceHandlerException {
|
||||
|
@ -180,12 +179,12 @@ public class GpuResourceAllocator {
|
|||
containerId));
|
||||
}
|
||||
|
||||
Set<Integer> assignedGpus = new HashSet<>();
|
||||
Set<GpuDevice> assignedGpus = new TreeSet<>();
|
||||
|
||||
for (int deviceNum : allowedGpuDevices) {
|
||||
if (!usedDevices.containsKey(deviceNum)) {
|
||||
usedDevices.put(deviceNum, containerId);
|
||||
assignedGpus.add(deviceNum);
|
||||
for (GpuDevice gpu : allowedGpuDevices) {
|
||||
if (!usedDevices.containsKey(gpu)) {
|
||||
usedDevices.put(gpu, containerId);
|
||||
assignedGpus.add(gpu);
|
||||
if (assignedGpus.size() == numRequestedGpuDevices) {
|
||||
break;
|
||||
}
|
||||
|
@ -194,21 +193,10 @@ public class GpuResourceAllocator {
|
|||
|
||||
// Record in state store if we allocated anything
|
||||
if (!assignedGpus.isEmpty()) {
|
||||
List<Serializable> allocatedDevices = new ArrayList<>();
|
||||
for (int gpu : assignedGpus) {
|
||||
allocatedDevices.add(String.valueOf(gpu));
|
||||
}
|
||||
try {
|
||||
// Update Container#getResourceMapping.
|
||||
ResourceMappings.AssignedResources assignedResources =
|
||||
new ResourceMappings.AssignedResources();
|
||||
assignedResources.updateAssignedResources(allocatedDevices);
|
||||
container.getResourceMappings().addAssignedResources(GPU_URI,
|
||||
assignedResources);
|
||||
|
||||
// Update state store.
|
||||
nmContext.getNMStateStore().storeAssignedResources(containerId,
|
||||
GPU_URI, allocatedDevices);
|
||||
nmContext.getNMStateStore().storeAssignedResources(container, GPU_URI,
|
||||
new ArrayList<Serializable>(assignedGpus));
|
||||
} catch (IOException e) {
|
||||
cleanupAssignGpus(containerId);
|
||||
throw new ResourceHandlerException(e);
|
||||
|
@ -226,7 +214,7 @@ public class GpuResourceAllocator {
|
|||
* @param containerId containerId
|
||||
*/
|
||||
public synchronized void cleanupAssignGpus(ContainerId containerId) {
|
||||
Iterator<Map.Entry<Integer, ContainerId>> iter =
|
||||
Iterator<Map.Entry<GpuDevice, ContainerId>> iter =
|
||||
usedDevices.entrySet().iterator();
|
||||
while (iter.hasNext()) {
|
||||
if (iter.next().getValue().equals(containerId)) {
|
||||
|
@ -236,7 +224,7 @@ public class GpuResourceAllocator {
|
|||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public synchronized Map<Integer, ContainerId> getDeviceAllocationMapping() {
|
||||
public synchronized Map<GpuDevice, ContainerId> getDeviceAllocationMapping() {
|
||||
return new HashMap<>(usedDevices);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,8 +24,6 @@ import org.apache.commons.logging.LogFactory;
|
|||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
|
@ -35,6 +33,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -64,17 +63,23 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
|
|||
@Override
|
||||
public List<PrivilegedOperation> bootstrap(Configuration configuration)
|
||||
throws ResourceHandlerException {
|
||||
List<Integer> minorNumbersOfUsableGpus;
|
||||
List<GpuDevice> usableGpus;
|
||||
try {
|
||||
minorNumbersOfUsableGpus = GpuDiscoverer.getInstance()
|
||||
.getMinorNumbersOfGpusUsableByYarn();
|
||||
usableGpus = GpuDiscoverer.getInstance()
|
||||
.getGpusUsableByYarn();
|
||||
if (usableGpus == null || usableGpus.isEmpty()) {
|
||||
String message = "GPU is enabled on the NodeManager, but couldn't find "
|
||||
+ "any usable GPU devices, please double check configuration.";
|
||||
LOG.error(message);
|
||||
throw new ResourceHandlerException(message);
|
||||
}
|
||||
} catch (YarnException e) {
|
||||
LOG.error("Exception when trying to get usable GPU device", e);
|
||||
throw new ResourceHandlerException(e);
|
||||
}
|
||||
|
||||
for (int minorNumber : minorNumbersOfUsableGpus) {
|
||||
gpuAllocator.addGpu(minorNumber);
|
||||
for (GpuDevice gpu : usableGpus) {
|
||||
gpuAllocator.addGpu(gpu);
|
||||
}
|
||||
|
||||
// And initialize cgroups
|
||||
|
@ -102,10 +107,13 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
|
|||
PrivilegedOperation.OperationType.GPU, Arrays
|
||||
.asList(CONTAINER_ID_CLI_OPTION, containerIdStr));
|
||||
if (!allocation.getDeniedGPUs().isEmpty()) {
|
||||
privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
|
||||
StringUtils.join(",", allocation.getDeniedGPUs())));
|
||||
List<Integer> minorNumbers = new ArrayList<>();
|
||||
for (GpuDevice deniedGpu : allocation.getDeniedGPUs()) {
|
||||
minorNumbers.add(deniedGpu.getMinorNumber());
|
||||
}
|
||||
privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
|
||||
StringUtils.join(",", minorNumbers)));
|
||||
}
|
||||
|
||||
privilegedOperationExecutor.executePrivilegedOperation(
|
||||
privilegedOperation, true);
|
||||
} catch (PrivilegedOperationException e) {
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* This class is used to represent GPU device while allocation.
|
||||
*/
|
||||
public class GpuDevice implements Serializable, Comparable {
|
||||
private int index;
|
||||
private int minorNumber;
|
||||
private static final long serialVersionUID = -6812314470754667710L;
|
||||
|
||||
public GpuDevice(int index, int minorNumber) {
|
||||
this.index = index;
|
||||
this.minorNumber = minorNumber;
|
||||
}
|
||||
|
||||
public int getIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
public int getMinorNumber() {
|
||||
return minorNumber;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == null || !(obj instanceof GpuDevice)) {
|
||||
return false;
|
||||
}
|
||||
GpuDevice other = (GpuDevice) obj;
|
||||
return index == other.index && minorNumber == other.minorNumber;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(Object obj) {
|
||||
if (obj == null || (!(obj instanceof GpuDevice))) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
GpuDevice other = (GpuDevice) obj;
|
||||
|
||||
int result = Integer.compare(index, other.index);
|
||||
if (0 != result) {
|
||||
return result;
|
||||
}
|
||||
return Integer.compare(minorNumber, other.minorNumber);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 47;
|
||||
return prime * index + minorNumber;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "(index=" + index + ",minor_number=" + minorNumber + ")";
|
||||
}
|
||||
}
|
|
@ -136,12 +136,12 @@ public class GpuDiscoverer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get list of minor device numbers of Gpu devices usable by YARN.
|
||||
* Get list of GPU devices usable by YARN.
|
||||
*
|
||||
* @return List of minor device numbers of Gpu devices.
|
||||
* @return List of GPU devices
|
||||
* @throws YarnException when any issue happens
|
||||
*/
|
||||
public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
|
||||
public synchronized List<GpuDevice> getGpusUsableByYarn()
|
||||
throws YarnException {
|
||||
validateConfOrThrowException();
|
||||
|
||||
|
@ -149,7 +149,7 @@ public class GpuDiscoverer {
|
|||
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
|
||||
|
||||
List<Integer> minorNumbers = new ArrayList<>();
|
||||
List<GpuDevice> gpuDevices = new ArrayList<>();
|
||||
|
||||
if (allowedDevicesStr.equals(
|
||||
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
|
||||
|
@ -167,21 +167,31 @@ public class GpuDiscoverer {
|
|||
}
|
||||
|
||||
if (lastDiscoveredGpuInformation.getGpus() != null) {
|
||||
for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
|
||||
.getGpus()) {
|
||||
minorNumbers.add(gpu.getMinorNumber());
|
||||
for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size();
|
||||
i++) {
|
||||
List<PerGpuDeviceInformation> gpuInfos =
|
||||
lastDiscoveredGpuInformation.getGpus();
|
||||
gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
|
||||
}
|
||||
}
|
||||
} else{
|
||||
for (String s : allowedDevicesStr.split(",")) {
|
||||
if (s.trim().length() > 0) {
|
||||
minorNumbers.add(Integer.valueOf(s.trim()));
|
||||
}
|
||||
}
|
||||
LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
|
||||
String[] kv = s.trim().split(":");
|
||||
if (kv.length != 2) {
|
||||
throw new YarnException(
|
||||
"Illegal format, it should be index:minor_number format, now it="
|
||||
+ s);
|
||||
}
|
||||
|
||||
return minorNumbers;
|
||||
gpuDevices.add(
|
||||
new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1])));
|
||||
}
|
||||
}
|
||||
LOG.info("Allowed GPU devices:" + gpuDevices);
|
||||
}
|
||||
|
||||
return gpuDevices;
|
||||
}
|
||||
|
||||
public synchronized void initialize(Configuration conf) throws YarnException {
|
||||
|
|
|
@ -40,12 +40,14 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
|
|||
public void updateConfiguredResource(Resource res) throws YarnException {
|
||||
LOG.info("Initializing configured GPU resources for the NodeManager.");
|
||||
|
||||
List<Integer> usableGpus =
|
||||
GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
|
||||
List<GpuDevice> usableGpus =
|
||||
GpuDiscoverer.getInstance().getGpusUsableByYarn();
|
||||
if (null == usableGpus || usableGpus.isEmpty()) {
|
||||
LOG.info("Didn't find any usable GPUs on the NodeManager.");
|
||||
String message = "GPU is enabled, but couldn't find any usable GPUs on the "
|
||||
+ "NodeManager.";
|
||||
LOG.error(message);
|
||||
// No gpu can be used by YARN.
|
||||
return;
|
||||
throw new YarnException(message);
|
||||
}
|
||||
|
||||
long nUsableGpus = usableGpus.size();
|
||||
|
|
|
@ -18,28 +18,9 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.recovery;
|
||||
|
||||
import static org.fusesource.leveldbjni.JniDBFactory.asString;
|
||||
import static org.fusesource.leveldbjni.JniDBFactory.bytes;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.apache.hadoop.yarn.api.records.Token;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Timer;
|
||||
import java.util.TimerTask;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.ArrayListMultimap;
|
||||
import com.google.common.collect.ListMultimap;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
@ -51,9 +32,11 @@ import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestP
|
|||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.Token;
|
||||
import org.apache.hadoop.yarn.api.records.impl.pb.ResourcePBImpl;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.proto.YarnProtos.LocalResourceProto;
|
||||
import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto;
|
||||
import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.MasterKeyProto;
|
||||
import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.VersionProto;
|
||||
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto;
|
||||
|
@ -61,9 +44,10 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Deletion
|
|||
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LocalizedResourceProto;
|
||||
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
|
||||
import org.apache.hadoop.yarn.proto.YarnServiceProtos.StartContainerRequestProto;
|
||||
import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.records.MasterKey;
|
||||
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||
import org.apache.hadoop.yarn.server.records.Version;
|
||||
import org.apache.hadoop.yarn.server.records.impl.pb.VersionPBImpl;
|
||||
|
@ -76,10 +60,26 @@ import org.iq80.leveldb.DB;
|
|||
import org.iq80.leveldb.DBException;
|
||||
import org.iq80.leveldb.Options;
|
||||
import org.iq80.leveldb.WriteBatch;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
import java.util.Timer;
|
||||
import java.util.TimerTask;
|
||||
|
||||
import static org.fusesource.leveldbjni.JniDBFactory.asString;
|
||||
import static org.fusesource.leveldbjni.JniDBFactory.bytes;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.ArrayListMultimap;
|
||||
import com.google.common.collect.ListMultimap;
|
||||
|
||||
public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||
|
||||
|
@ -1180,15 +1180,18 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void storeAssignedResources(ContainerId containerId,
|
||||
public void storeAssignedResources(Container container,
|
||||
String resourceType, List<Serializable> assignedResources)
|
||||
throws IOException {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("storeAssignedResources: containerId=" + containerId
|
||||
+ ", assignedResources=" + StringUtils.join(",", assignedResources));
|
||||
LOG.debug(
|
||||
"storeAssignedResources: containerId=" + container.getContainerId()
|
||||
+ ", assignedResources=" + StringUtils
|
||||
.join(",", assignedResources));
|
||||
|
||||
}
|
||||
|
||||
String keyResChng = CONTAINERS_KEY_PREFIX + containerId.toString()
|
||||
String keyResChng = CONTAINERS_KEY_PREFIX + container.getContainerId().toString()
|
||||
+ CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX + resourceType;
|
||||
try {
|
||||
WriteBatch batch = db.createWriteBatch();
|
||||
|
@ -1206,6 +1209,9 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
|||
} catch (DBException e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
|
||||
// update container resource mapping.
|
||||
updateContainerResourceMapping(container, resourceType, assignedResources);
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
|
|
|
@ -35,6 +35,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Localize
|
|||
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.records.MasterKey;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
|
||||
// The state store to use when state isn't being stored
|
||||
public class NMNullStateStoreService extends NMStateStoreService {
|
||||
|
@ -268,7 +269,7 @@ public class NMNullStateStoreService extends NMStateStoreService {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void storeAssignedResources(ContainerId containerId,
|
||||
public void storeAssignedResources(Container container,
|
||||
String resourceType, List<Serializable> assignedResources)
|
||||
throws IOException {
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Localize
|
|||
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
|
||||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.records.MasterKey;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||
|
||||
@Private
|
||||
|
@ -732,12 +733,12 @@ public abstract class NMStateStoreService extends AbstractService {
|
|||
/**
|
||||
* Store the assigned resources to a container.
|
||||
*
|
||||
* @param containerId Container Id
|
||||
* @param container NMContainer
|
||||
* @param resourceType Resource Type
|
||||
* @param assignedResources Assigned resources
|
||||
* @throws IOException if fails
|
||||
*/
|
||||
public abstract void storeAssignedResources(ContainerId containerId,
|
||||
public abstract void storeAssignedResources(Container container,
|
||||
String resourceType, List<Serializable> assignedResources)
|
||||
throws IOException;
|
||||
|
||||
|
@ -746,4 +747,14 @@ public abstract class NMStateStoreService extends AbstractService {
|
|||
protected abstract void startStorage() throws IOException;
|
||||
|
||||
protected abstract void closeStorage() throws IOException;
|
||||
|
||||
protected void updateContainerResourceMapping(Container container,
|
||||
String resourceType, List<Serializable> assignedResources) {
|
||||
// Update Container#getResourceMapping.
|
||||
ResourceMappings.AssignedResources newAssigned =
|
||||
new ResourceMappings.AssignedResources();
|
||||
newAssigned.updateAssignedResources(assignedResources);
|
||||
container.getResourceMappings().addAssignedResources(resourceType,
|
||||
newAssigned);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -519,18 +519,20 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
|||
|
||||
commonLaunchContainer(appId, cid, cm);
|
||||
|
||||
Container nmContainer = context.getContainers().get(cid);
|
||||
|
||||
Application app = context.getApplications().get(appId);
|
||||
assertNotNull(app);
|
||||
|
||||
// store resource mapping of the container
|
||||
List<Serializable> gpuResources =
|
||||
Arrays.<Serializable>asList("1", "2", "3");
|
||||
stateStore.storeAssignedResources(cid, "gpu", gpuResources);
|
||||
stateStore.storeAssignedResources(nmContainer, "gpu", gpuResources);
|
||||
List<Serializable> numaResources = Arrays.<Serializable>asList("numa1");
|
||||
stateStore.storeAssignedResources(cid, "numa", numaResources);
|
||||
stateStore.storeAssignedResources(nmContainer, "numa", numaResources);
|
||||
List<Serializable> fpgaResources =
|
||||
Arrays.<Serializable>asList("fpga1", "fpga2");
|
||||
stateStore.storeAssignedResources(cid, "fpga", fpgaResources);
|
||||
stateStore.storeAssignedResources(nmContainer, "fpga", fpgaResources);
|
||||
|
||||
cm.stop();
|
||||
context = createContext(conf, stateStore);
|
||||
|
@ -542,7 +544,6 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
|||
app = context.getApplications().get(appId);
|
||||
assertNotNull(app);
|
||||
|
||||
Container nmContainer = context.getContainers().get(cid);
|
||||
Assert.assertNotNull(nmContainer);
|
||||
ResourceMappings resourceMappings = nmContainer.getResourceMappings();
|
||||
List<Serializable> assignedResource = resourceMappings
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
|
|||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
|
@ -36,9 +35,10 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
|
||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||
import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
|
@ -46,6 +46,7 @@ import org.junit.Test;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
|
@ -92,7 +93,7 @@ public class TestGpuResourceHandler {
|
|||
@Test
|
||||
public void testBootStrap() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
|
||||
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
|
@ -106,8 +107,8 @@ public class TestGpuResourceHandler {
|
|||
.newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
|
||||
}
|
||||
|
||||
private static Container mockContainerWithGpuRequest(int id,
|
||||
int numGpuRequest) {
|
||||
private static Container mockContainerWithGpuRequest(int id, int numGpuRequest,
|
||||
boolean dockerContainerEnabled) {
|
||||
Container c = mock(Container.class);
|
||||
when(c.getContainerId()).thenReturn(getContainerId(id));
|
||||
|
||||
|
@ -117,29 +118,46 @@ public class TestGpuResourceHandler {
|
|||
res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
|
||||
when(c.getResource()).thenReturn(res);
|
||||
when(c.getResourceMappings()).thenReturn(resMapping);
|
||||
|
||||
ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
|
||||
Map<String, String> env = new HashMap<>();
|
||||
if (dockerContainerEnabled) {
|
||||
env.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE, "docker");
|
||||
}
|
||||
when(clc.getEnvironment()).thenReturn(env);
|
||||
when(c.getLaunchContext()).thenReturn(clc);
|
||||
return c;
|
||||
}
|
||||
|
||||
private static Container mockContainerWithGpuRequest(int id,
|
||||
int numGpuRequest) {
|
||||
return mockContainerWithGpuRequest(id, numGpuRequest, false);
|
||||
}
|
||||
|
||||
private void verifyDeniedDevices(ContainerId containerId,
|
||||
List<Integer> deniedDevices)
|
||||
List<GpuDevice> deniedDevices)
|
||||
throws ResourceHandlerException, PrivilegedOperationException {
|
||||
verify(mockCGroupsHandler, times(1)).createCGroup(
|
||||
CGroupsHandler.CGroupController.DEVICES, containerId.toString());
|
||||
|
||||
if (null != deniedDevices && !deniedDevices.isEmpty()) {
|
||||
List<Integer> deniedDevicesMinorNumber = new ArrayList<>();
|
||||
for (GpuDevice deniedDevice : deniedDevices) {
|
||||
deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber());
|
||||
}
|
||||
verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
|
||||
new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
|
||||
.asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
|
||||
containerId.toString(),
|
||||
GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
|
||||
StringUtils.join(",", deniedDevices))), true);
|
||||
StringUtils.join(",", deniedDevicesMinorNumber))), true);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllocation() throws Exception {
|
||||
private void commonTestAllocation(boolean dockerContainerEnabled)
|
||||
throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
|
@ -147,31 +165,55 @@ public class TestGpuResourceHandler {
|
|||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
|
||||
/* Start container 1, asks 3 containers */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
|
||||
gpuResourceHandler.preStart(
|
||||
mockContainerWithGpuRequest(1, 3, dockerContainerEnabled));
|
||||
|
||||
// Only device=4 will be blocked.
|
||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
|
||||
if (dockerContainerEnabled) {
|
||||
verifyDeniedDevices(getContainerId(1),
|
||||
Collections.<GpuDevice>emptyList());
|
||||
} else{
|
||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3,4)));
|
||||
}
|
||||
|
||||
/* Start container 2, asks 2 containers. Excepted to fail */
|
||||
boolean failedToAllocate = false;
|
||||
try {
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
|
||||
gpuResourceHandler.preStart(
|
||||
mockContainerWithGpuRequest(2, 2, dockerContainerEnabled));
|
||||
} catch (ResourceHandlerException e) {
|
||||
failedToAllocate = true;
|
||||
}
|
||||
Assert.assertTrue(failedToAllocate);
|
||||
|
||||
/* Start container 3, ask 1 container, succeeded */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
|
||||
gpuResourceHandler.preStart(
|
||||
mockContainerWithGpuRequest(3, 1, dockerContainerEnabled));
|
||||
|
||||
// devices = 0/1/3 will be blocked
|
||||
verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
|
||||
if (dockerContainerEnabled) {
|
||||
verifyDeniedDevices(getContainerId(3),
|
||||
Collections.<GpuDevice>emptyList());
|
||||
} else {
|
||||
verifyDeniedDevices(getContainerId(3), Arrays
|
||||
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
|
||||
new GpuDevice(2, 3)));
|
||||
}
|
||||
|
||||
|
||||
/* Start container 4, ask 0 container, succeeded */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
|
||||
gpuResourceHandler.preStart(
|
||||
mockContainerWithGpuRequest(4, 0, dockerContainerEnabled));
|
||||
|
||||
if (dockerContainerEnabled) {
|
||||
verifyDeniedDevices(getContainerId(4),
|
||||
Collections.<GpuDevice>emptyList());
|
||||
} else{
|
||||
// All devices will be blocked
|
||||
verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
|
||||
verifyDeniedDevices(getContainerId(4), Arrays
|
||||
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
|
||||
new GpuDevice(3, 4)));
|
||||
}
|
||||
|
||||
/* Release container-1, expect cgroups deleted */
|
||||
gpuResourceHandler.postComplete(getContainerId(1));
|
||||
|
@ -190,12 +232,24 @@ public class TestGpuResourceHandler {
|
|||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllocationWhenDockerContainerEnabled() throws Exception {
|
||||
// When docker container is enabled, no devices should be written to
|
||||
// devices.deny.
|
||||
commonTestAllocation(true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllocation() throws Exception {
|
||||
commonTestAllocation(false);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
|
||||
throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
|
@ -204,7 +258,7 @@ public class TestGpuResourceHandler {
|
|||
|
||||
doThrow(new IOException("Exception ...")).when(mockNMStateStore)
|
||||
.storeAssignedResources(
|
||||
any(ContainerId.class), anyString(), anyList());
|
||||
any(Container.class), anyString(), anyList());
|
||||
|
||||
boolean exception = false;
|
||||
/* Start container 1, asks 3 containers */
|
||||
|
@ -227,13 +281,16 @@ public class TestGpuResourceHandler {
|
|||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
try {
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
Assert.assertEquals(0,
|
||||
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
|
||||
Assert.fail("Should fail because no GPU available");
|
||||
} catch (ResourceHandlerException e) {
|
||||
// Expected because of no resource available
|
||||
}
|
||||
|
||||
/* Start container 1, asks 0 containers */
|
||||
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
|
||||
verifyDeniedDevices(getContainerId(1), Collections.<Integer>emptyList());
|
||||
verifyDeniedDevices(getContainerId(1), Collections.<GpuDevice>emptyList());
|
||||
|
||||
/* Start container 2, asks 1 containers. Excepted to fail */
|
||||
boolean failedToAllocate = false;
|
||||
|
@ -256,7 +313,7 @@ public class TestGpuResourceHandler {
|
|||
@Test
|
||||
public void testAllocationStored() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
|
@ -267,34 +324,34 @@ public class TestGpuResourceHandler {
|
|||
Container container = mockContainerWithGpuRequest(1, 3);
|
||||
gpuResourceHandler.preStart(container);
|
||||
|
||||
verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
|
||||
ResourceInformation.GPU_URI,
|
||||
Arrays.<Serializable>asList("0", "1", "3"));
|
||||
|
||||
Assert.assertEquals(3, container.getResourceMappings()
|
||||
.getAssignedResources(ResourceInformation.GPU_URI).size());
|
||||
verify(mockNMStateStore).storeAssignedResources(container,
|
||||
ResourceInformation.GPU_URI, Arrays
|
||||
.<Serializable>asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
|
||||
new GpuDevice(2, 3)));
|
||||
|
||||
// Only device=4 will be blocked.
|
||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
|
||||
verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3, 4)));
|
||||
|
||||
/* Start container 2, ask 0 container, succeeded */
|
||||
container = mockContainerWithGpuRequest(2, 0);
|
||||
gpuResourceHandler.preStart(container);
|
||||
|
||||
verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
|
||||
verifyDeniedDevices(getContainerId(2), Arrays
|
||||
.asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
|
||||
new GpuDevice(3, 4)));
|
||||
Assert.assertEquals(0, container.getResourceMappings()
|
||||
.getAssignedResources(ResourceInformation.GPU_URI).size());
|
||||
|
||||
// Store assigned resource will not be invoked.
|
||||
verify(mockNMStateStore, never()).storeAssignedResources(
|
||||
eq(getContainerId(2)), eq(ResourceInformation.GPU_URI),
|
||||
eq(container), eq(ResourceInformation.GPU_URI),
|
||||
anyListOf(Serializable.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRecoverResourceAllocation() throws Exception {
|
||||
Configuration conf = new YarnConfiguration();
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
|
||||
GpuDiscoverer.getInstance().initialize(conf);
|
||||
|
||||
gpuResourceHandler.bootstrap(conf);
|
||||
|
@ -305,7 +362,8 @@ public class TestGpuResourceHandler {
|
|||
ResourceMappings rmap = new ResourceMappings();
|
||||
ResourceMappings.AssignedResources ar =
|
||||
new ResourceMappings.AssignedResources();
|
||||
ar.updateAssignedResources(Arrays.<Serializable>asList("1", "3"));
|
||||
ar.updateAssignedResources(
|
||||
Arrays.<Serializable>asList(new GpuDevice(1, 1), new GpuDevice(2, 3)));
|
||||
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
|
||||
when(nmContainer.getResourceMappings()).thenReturn(rmap);
|
||||
|
||||
|
@ -315,12 +373,15 @@ public class TestGpuResourceHandler {
|
|||
// Reacquire container restore state of GPU Resource Allocator.
|
||||
gpuResourceHandler.reacquireContainer(getContainerId(1));
|
||||
|
||||
Map<Integer, ContainerId> deviceAllocationMapping =
|
||||
Map<GpuDevice, ContainerId> deviceAllocationMapping =
|
||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
|
||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
||||
Assert.assertTrue(
|
||||
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
|
||||
deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1)));
|
||||
Assert.assertTrue(
|
||||
deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3)));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
||||
getContainerId(1));
|
||||
|
||||
// TEST CASE
|
||||
// Try to reacquire a container but requested device is not in allowed list.
|
||||
|
@ -328,7 +389,8 @@ public class TestGpuResourceHandler {
|
|||
rmap = new ResourceMappings();
|
||||
ar = new ResourceMappings.AssignedResources();
|
||||
// id=5 is not in allowed list.
|
||||
ar.updateAssignedResources(Arrays.<Serializable>asList("4", "5"));
|
||||
ar.updateAssignedResources(
|
||||
Arrays.<Serializable>asList(new GpuDevice(3, 4), new GpuDevice(4, 5)));
|
||||
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
|
||||
when(nmContainer.getResourceMappings()).thenReturn(rmap);
|
||||
|
||||
|
@ -348,9 +410,10 @@ public class TestGpuResourceHandler {
|
|||
deviceAllocationMapping =
|
||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
|
||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
||||
Assert.assertTrue(
|
||||
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
|
||||
Assert.assertTrue(deviceAllocationMapping.keySet()
|
||||
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
||||
getContainerId(1));
|
||||
|
||||
// TEST CASE
|
||||
// Try to reacquire a container but requested device is already assigned.
|
||||
|
@ -358,7 +421,8 @@ public class TestGpuResourceHandler {
|
|||
rmap = new ResourceMappings();
|
||||
ar = new ResourceMappings.AssignedResources();
|
||||
// id=3 is already assigned
|
||||
ar.updateAssignedResources(Arrays.<Serializable>asList("4", "3"));
|
||||
ar.updateAssignedResources(
|
||||
Arrays.<Serializable>asList(new GpuDevice(3, 4), new GpuDevice(2, 3)));
|
||||
rmap.addAssignedResources("gpu", ar);
|
||||
when(nmContainer.getResourceMappings()).thenReturn(rmap);
|
||||
|
||||
|
@ -378,8 +442,9 @@ public class TestGpuResourceHandler {
|
|||
deviceAllocationMapping =
|
||||
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
|
||||
Assert.assertEquals(2, deviceAllocationMapping.size());
|
||||
Assert.assertTrue(
|
||||
deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
|
||||
Assert.assertTrue(deviceAllocationMapping.keySet()
|
||||
.containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
|
||||
Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
|
||||
getContainerId(1));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -101,23 +101,41 @@ public class TestGpuDiscoverer {
|
|||
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
|
||||
|
||||
Assert.assertTrue(info.getGpus().size() > 0);
|
||||
Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
|
||||
Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
|
||||
info.getGpus().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void getNumberOfUsableGpusFromConfig() throws YarnException {
|
||||
Configuration conf = new Configuration(false);
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
|
||||
|
||||
// Illegal format
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
|
||||
GpuDiscoverer plugin = new GpuDiscoverer();
|
||||
try {
|
||||
plugin.initialize(conf);
|
||||
plugin.getGpusUsableByYarn();
|
||||
Assert.fail("Illegal format, should fail.");
|
||||
} catch (YarnException e) {
|
||||
// Expected
|
||||
}
|
||||
|
||||
// Valid format
|
||||
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
|
||||
plugin = new GpuDiscoverer();
|
||||
plugin.initialize(conf);
|
||||
|
||||
List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
|
||||
Assert.assertEquals(4, minorNumbers.size());
|
||||
List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
|
||||
Assert.assertEquals(4, usableGpuDevices.size());
|
||||
|
||||
Assert.assertTrue(0 == minorNumbers.get(0));
|
||||
Assert.assertTrue(1 == minorNumbers.get(1));
|
||||
Assert.assertTrue(2 == minorNumbers.get(2));
|
||||
Assert.assertTrue(4 == minorNumbers.get(3));
|
||||
Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
|
||||
Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
|
||||
Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
|
||||
Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
|
||||
|
||||
Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
|
||||
Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
|
||||
Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
|
||||
Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,6 +43,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDelet
|
|||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.records.MasterKey;
|
||||
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||
|
||||
|
||||
|
@ -515,14 +516,17 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void storeAssignedResources(ContainerId containerId,
|
||||
public void storeAssignedResources(Container container,
|
||||
String resourceType, List<Serializable> assignedResources)
|
||||
throws IOException {
|
||||
ResourceMappings.AssignedResources ar =
|
||||
new ResourceMappings.AssignedResources();
|
||||
ar.updateAssignedResources(assignedResources);
|
||||
containerStates.get(containerId).getResourceMappings()
|
||||
containerStates.get(container.getContainerId()).getResourceMappings()
|
||||
.addAssignedResources(resourceType, ar);
|
||||
|
||||
// update container resource mapping.
|
||||
updateContainerResourceMapping(container, resourceType, assignedResources);
|
||||
}
|
||||
|
||||
private static class TrackerState {
|
||||
|
|
|
@ -29,6 +29,7 @@ import static org.mockito.Mockito.isNull;
|
|||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.timeout;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
@ -69,6 +70,8 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDelet
|
|||
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.records.MasterKey;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.amrmproxy.AMRMProxyTokenSecretManager;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.LocalResourceTrackerState;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState;
|
||||
|
@ -1143,17 +1146,22 @@ public class TestNMLeveldbStateStoreService {
|
|||
ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
|
||||
storeMockContainer(containerId);
|
||||
|
||||
Container container = mock(Container.class);
|
||||
when(container.getContainerId()).thenReturn(containerId);
|
||||
ResourceMappings resourceMappings = new ResourceMappings();
|
||||
when(container.getResourceMappings()).thenReturn(resourceMappings);
|
||||
|
||||
// Store ResourceMapping
|
||||
stateStore.storeAssignedResources(containerId, "gpu",
|
||||
stateStore.storeAssignedResources(container, "gpu",
|
||||
Arrays.<Serializable>asList("1", "2", "3"));
|
||||
// This will overwrite above
|
||||
List<Serializable> gpuRes1 = Arrays.<Serializable>asList("1", "2", "4");
|
||||
stateStore.storeAssignedResources(containerId, "gpu", gpuRes1);
|
||||
stateStore.storeAssignedResources(container, "gpu", gpuRes1);
|
||||
List<Serializable> fpgaRes =
|
||||
Arrays.<Serializable>asList("3", "4", "5", "6");
|
||||
stateStore.storeAssignedResources(containerId, "fpga", fpgaRes);
|
||||
stateStore.storeAssignedResources(container, "fpga", fpgaRes);
|
||||
List<Serializable> numaRes = Arrays.<Serializable>asList("numa1");
|
||||
stateStore.storeAssignedResources(containerId, "numa", numaRes);
|
||||
stateStore.storeAssignedResources(container, "numa", numaRes);
|
||||
|
||||
// add a invalid key
|
||||
restartStateStore();
|
||||
|
@ -1163,12 +1171,18 @@ public class TestNMLeveldbStateStoreService {
|
|||
List<Serializable> res = rcs.getResourceMappings()
|
||||
.getAssignedResources("gpu");
|
||||
Assert.assertTrue(res.equals(gpuRes1));
|
||||
Assert.assertTrue(
|
||||
resourceMappings.getAssignedResources("gpu").equals(gpuRes1));
|
||||
|
||||
res = rcs.getResourceMappings().getAssignedResources("fpga");
|
||||
Assert.assertTrue(res.equals(fpgaRes));
|
||||
Assert.assertTrue(
|
||||
resourceMappings.getAssignedResources("fpga").equals(fpgaRes));
|
||||
|
||||
res = rcs.getResourceMappings().getAssignedResources("numa");
|
||||
Assert.assertTrue(res.equals(numaRes));
|
||||
Assert.assertTrue(
|
||||
resourceMappings.getAssignedResources("numa").equals(numaRes));
|
||||
}
|
||||
|
||||
private StartContainerRequest storeMockContainer(ContainerId containerId)
|
||||
|
|
Loading…
Reference in New Issue