YARN-9268. General improvements in FpgaDevice. Contributed by Peter Bacsko.

This commit is contained in:
Devaraj K 2019-03-25 13:22:53 -07:00
parent 8739693514
commit eeda6891e4
7 changed files with 90 additions and 133 deletions

View File

@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -51,7 +52,7 @@ public class FpgaResourceAllocator {
//key is resource type of FPGA, vendor plugin supported ID
private LinkedHashMap<String, List<FpgaDevice>> availableFpga = new LinkedHashMap<>();
//key is requetor, aka. container ID
//key is requestor, aka. container ID
private LinkedHashMap<String, List<FpgaDevice>> usedFpgaByRequestor = new LinkedHashMap<>();
private Context nmContext;
@ -133,35 +134,33 @@ public class FpgaResourceAllocator {
}
}
public static class FpgaDevice implements Comparable<FpgaDevice>, Serializable {
/** A class that represents an FPGA card. */
public static class FpgaDevice implements Serializable {
private static final long serialVersionUID = -4678487141824092751L;
private final String type;
private final int major;
private final int minor;
private static final long serialVersionUID = 1L;
private String type;
private Integer major;
private Integer minor;
// IP file identifier. matrix multiplication for instance
private String IPID;
// SHA-256 hash of the uploaded aocx file
private String aocxHash;
// the device name under /dev
private String devName;
// the alias device name. Intel use acl number acl0 to acl31
private String aliasDevName;
// lspci output's bus number: 02:00.00 (bus:slot.func)
private String busNum;
private String temperature;
private String cardPowerUsage;
private final String aliasDevName;
// IP file identifier. matrix multiplication for instance (mutable)
private String IPID;
// SHA-256 hash of the uploaded aocx file (mutable)
private String aocxHash;
// cached hash value
private Integer hashCode;
/**
 * Returns the type of this FPGA device.
 *
 * @return the value of the {@code type} field as set by the constructor
 */
public String getType() {
return type;
}
public Integer getMajor() {
public int getMajor() {
return major;
}
public Integer getMinor() {
public int getMinor() {
return minor;
}
@ -181,57 +180,16 @@ public class FpgaResourceAllocator {
this.IPID = IPID;
}
public String getDevName() {
return devName;
}
public void setDevName(String devName) {
this.devName = devName;
}
/**
 * Returns the alias device name of this FPGA device. For Intel cards this
 * is the acl number, e.g. {@code acl0}.
 *
 * @return the value of the {@code aliasDevName} field
 */
public String getAliasDevName() {
return aliasDevName;
}
public void setAliasDevName(String aliasDevName) {
this.aliasDevName = aliasDevName;
}
public String getBusNum() {
return busNum;
}
public void setBusNum(String busNum) {
this.busNum = busNum;
}
public String getTemperature() {
return temperature;
}
public String getCardPowerUsage() {
return cardPowerUsage;
}
public FpgaDevice(String type, Integer major, Integer minor, String IPID) {
this.type = type;
public FpgaDevice(String type, int major, int minor, String aliasDevName) {
this.type = Preconditions.checkNotNull(type, "type must not be null");
this.major = major;
this.minor = minor;
this.IPID = IPID;
}
public FpgaDevice(String type, Integer major,
Integer minor, String IPID, String devName,
String aliasDevName, String busNum, String temperature, String cardPowerUsage) {
this.type = type;
this.major = major;
this.minor = minor;
this.IPID = IPID;
this.devName = devName;
this.aliasDevName = aliasDevName;
this.busNum = busNum;
this.temperature = temperature;
this.cardPowerUsage = cardPowerUsage;
this.aliasDevName = Preconditions.checkNotNull(aliasDevName,
"aliasDevName must not be null");
}
@Override
@ -242,31 +200,48 @@ public class FpgaResourceAllocator {
if (obj == null) {
return false;
}
if (!(obj instanceof FpgaDevice)) {
if (getClass() != obj.getClass()) {
return false;
}
FpgaDevice other = (FpgaDevice) obj;
if (other.getType().equals(this.type) &&
other.getMajor().equals(this.major) &&
other.getMinor().equals(this.minor)) {
return true;
if (aliasDevName == null) {
if (other.aliasDevName != null) {
return false;
}
} else if (!aliasDevName.equals(other.aliasDevName)) {
return false;
}
return false;
if (major != other.major) {
return false;
}
if (minor != other.minor) {
return false;
}
if (type == null) {
if (other.type != null) {
return false;
}
} else if (!type.equals(other.type)) {
return false;
}
return true;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((type == null) ? 0 : type.hashCode());
result = prime * result + ((major == null) ? 0 : major.hashCode());
result = prime * result + ((minor == null) ? 0 : minor.hashCode());
return result;
}
if (hashCode == null) {
final int prime = 31;
int result = 1;
@Override
public int compareTo(FpgaDevice o) {
return 0;
result = prime * result + major;
result = prime * result + type.hashCode();
result = prime * result + minor;
result = prime * result + aliasDevName.hashCode();
hashCode = result;
}
return hashCode;
}
@Override

View File

@ -149,8 +149,8 @@ final class AoclDiagnosticOutputParser {
devices.add(new FpgaDevice(fpgaType,
Integer.parseInt(mmn[0]),
Integer.parseInt(mmn[1]), null,
fields[0], aliasName, fields[1], fields[2], fields[3]));
Integer.parseInt(mmn[1]),
aliasName));
} else {
LOG.warn("Failed to retrieve major/minor number for device");
}

View File

@ -153,7 +153,7 @@ public class FpgaDiscoverer {
// Replace list with a filtered one
list = list
.stream()
.filter(dev -> minors.contains(dev.getMinor().toString()))
.filter(dev -> minors.contains(String.valueOf(dev.getMinor())))
.collect(Collectors.toList());
// if the count of user configured is still larger than actual

View File

@ -66,12 +66,7 @@ public final class DeviceSpecParser {
fpgaDevices.add(new FpgaDevice(type,
major,
minor,
null,
null,
devName,
null,
null,
null));
devName));
} catch (NumberFormatException e) {
throw new ResourceHandlerException(
"Cannot parse major/minor number: " + deviceSpec);

View File

@ -112,7 +112,7 @@ public class TestFpgaResourceHandler {
// Assumed devices parsed from output
deviceList = new ArrayList<>();
for (int i = 0; i < 5; i++) {
deviceList.add(new FpgaDevice(vendorType, 247, i, null));
deviceList.add(new FpgaDevice(vendorType, 247, i, "acl" + i));
}
String aocxPath = getTestParentFolder() + "/test.aocx";
mockVendorPlugin = mockPlugin(vendorType, deviceList, aocxPath);
@ -163,11 +163,11 @@ public class TestFpgaResourceHandler {
for (String s : allowed.split(",")) {
boolean check = false;
for (FpgaDevice device : allowedDevices) {
if (device.getMinor().toString().equals(s)) {
if (String.valueOf(device.getMinor()).equals(s)) {
check = true;
}
}
Assert.assertTrue("Minor:" + s +"found", check);
Assert.assertTrue("Minor:" + s +" found", check);
}
Assert.assertEquals(3,
fpgaResourceHandler.getFpgaAllocator().getAvailableFpgaCount());
@ -398,10 +398,10 @@ public class TestFpgaResourceHandler {
public void testReacquireContainer() throws ResourceHandlerException {
Container c0 = mockContainer(0, 2, "GEMM");
List<FpgaDevice> assigned = new ArrayList<>();
assigned.add(new
FpgaDevice(vendorType, 247, 0, null));
assigned.add(new
FpgaDevice(vendorType, 247, 1, null));
assigned.add(new FpgaDevice(
vendorType, 247, 0, "acl0"));
assigned.add(new FpgaDevice(
vendorType, 247, 1, "acl1"));
// Mock we've stored the c0 states
mockStateStoreForContainer(c0, assigned);
// NM start
@ -419,10 +419,10 @@ public class TestFpgaResourceHandler {
getUsedFpga().get(getContainerId(0).toString());
int count = 0;
for (FpgaDevice device : used) {
if (device.getMinor().equals(0)){
if (device.getMinor() == 0){
count++;
}
if (device.getMinor().equals(1)) {
if (device.getMinor() == 1) {
count++;
}
}
@ -434,7 +434,7 @@ public class TestFpgaResourceHandler {
.get(vendorType);
count = 0;
for (FpgaDevice device : available) {
if (device.getMinor().equals(2)) {
if (device.getMinor() == 2) {
count++;
}
}
@ -445,8 +445,8 @@ public class TestFpgaResourceHandler {
// Case 2. Recover a not allowed device with minor number 5
Container c1 = mockContainer(1, 1, "GEMM");
assigned = new ArrayList<>();
assigned.add(new
FpgaDevice(vendorType, 247, 5, null));
assigned.add(new FpgaDevice(
vendorType, 247, 5, "acl0"));
// Mock we've stored the c1 states
mockStateStoreForContainer(c1, assigned);
boolean flag = false;
@ -464,8 +464,8 @@ public class TestFpgaResourceHandler {
// Case 3. recover a already used device by other container
Container c2 = mockContainer(2, 1, "GEMM");
assigned = new ArrayList<>();
assigned.add(new
FpgaDevice(vendorType, 247, 1, null));
assigned.add(new FpgaDevice(
vendorType, 247, 1, "acl0"));
// Mock we've stored the c2 states
mockStateStoreForContainer(c2, assigned);
flag = false;
@ -483,8 +483,8 @@ public class TestFpgaResourceHandler {
// Case 4. recover a normal container c3 with remaining minor device number 2
Container c3 = mockContainer(3, 1, "GEMM");
assigned = new ArrayList<>();
assigned.add(new
FpgaDevice(vendorType, 247, 2, null));
assigned.add(new FpgaDevice(
vendorType, 247, 2, "acl2"));
// Mock we've stored the c3 states
mockStateStoreForContainer(c3, assigned);
fpgaResourceHandler.reacquireContainer(getContainerId(3));

View File

@ -83,31 +83,19 @@ public class TestAoclOutputParser {
assertEquals(3, devices.size());
assertEquals("IntelOpenCL", devices.get(0).getType());
assertEquals("247", devices.get(0).getMajor().toString());
assertEquals("0", devices.get(0).getMinor().toString());
assertEquals(247, devices.get(0).getMajor());
assertEquals(0, devices.get(0).getMinor());
assertEquals("acl0", devices.get(0).getAliasDevName());
assertEquals("aclnalla_pcie0", devices.get(0).getDevName());
assertEquals("02:00.00", devices.get(0).getBusNum());
assertEquals("53.1 degrees C", devices.get(0).getTemperature());
assertEquals("31.7 Watts", devices.get(0).getCardPowerUsage());
assertEquals("IntelOpenCL", devices.get(1).getType());
assertEquals("247", devices.get(1).getMajor().toString());
assertEquals("1", devices.get(1).getMinor().toString());
assertEquals(247, devices.get(1).getMajor());
assertEquals(1, devices.get(1).getMinor());
assertEquals("acl1", devices.get(1).getAliasDevName());
assertEquals("aclnalla_pcie1", devices.get(1).getDevName());
assertEquals("03:00.00", devices.get(1).getBusNum());
assertEquals("43.1 degrees C", devices.get(1).getTemperature());
assertEquals("11.7 Watts", devices.get(1).getCardPowerUsage());
assertEquals("IntelOpenCL", devices.get(2).getType());
assertEquals("246", devices.get(2).getMajor().toString());
assertEquals("0", devices.get(2).getMinor().toString());
assertEquals(246, devices.get(2).getMajor());
assertEquals(0, devices.get(2).getMinor());
assertEquals("acl2", devices.get(2).getAliasDevName());
assertEquals("acla10_ref0", devices.get(2).getDevName());
assertEquals("09:00.00", devices.get(2).getBusNum());
assertEquals("50.5781 degrees C", devices.get(2).getTemperature());
assertEquals("", devices.get(2).getCardPowerUsage());
// Case 2. check alias map
assertEquals("acl0", devices.get(0).getAliasDevName());

View File

@ -19,7 +19,6 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga;
import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.anyString;
@ -175,12 +174,12 @@ public class TestFpgaDiscoverer {
FpgaDevice device1 = devices.get(1);
assertEquals("Device id", "acl0", device0.getAliasDevName());
assertEquals("Minor number", new Integer(0), device0.getMinor());
assertEquals("Major", new Integer(243), device0.getMajor());
assertEquals("Minor number", 0, device0.getMinor());
assertEquals("Major", 243, device0.getMajor());
assertEquals("Device id", "acl1", device1.getAliasDevName());
assertEquals("Minor number", new Integer(1), device1.getMinor());
assertEquals("Major", new Integer(244), device1.getMajor());
assertEquals("Minor number", 1, device1.getMinor());
assertEquals("Major", 244, device1.getMajor());
}
@Test
@ -245,12 +244,12 @@ public class TestFpgaDiscoverer {
FpgaDevice device1 = devices.get(1);
assertEquals("Device id", "acl0", device0.getAliasDevName());
assertEquals("Minor number", new Integer(0), device0.getMinor());
assertEquals("Major", new Integer(243), device0.getMajor());
assertEquals("Minor number", 0, device0.getMinor());
assertEquals("Major", 243, device0.getMajor());
assertEquals("Device id", "acl1", device1.getAliasDevName());
assertEquals("Minor number", new Integer(1), device1.getMinor());
assertEquals("Major", new Integer(244), device1.getMajor());
assertEquals("Minor number", 1, device1.getMinor());
assertEquals("Major", 244, device1.getMajor());
}
@Test