YARN-2531. Added a configuration for admins to be able to override app-configs and enforce/not-enforce strict control of per-container cpu usage. Contributed by Varun Vasudev.
(cherry picked from commit 9f6891d9ef
)
This commit is contained in:
parent
08abb320a7
commit
9d34dc87e1
|
@ -57,6 +57,10 @@ Release 2.6.0 - UNRELEASED
|
|||
failures should be ignored towards counting max-attempts. (Xuan Gong via
|
||||
vinodkv)
|
||||
|
||||
YARN-2531. Added a configuration for admins to be able to override app-configs
|
||||
and enforce/not-enforce strict control of per-container cpu usage. (Varun
|
||||
Vasudev via vinodkv)
|
||||
|
||||
IMPROVEMENTS
|
||||
|
||||
YARN-2242. Improve exception information on AM launch crashes. (Li Lu
|
||||
|
|
|
@ -902,6 +902,16 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH =
|
||||
NM_PREFIX + "linux-container-executor.cgroups.mount-path";
|
||||
|
||||
/**
|
||||
* Whether the apps should run in strict resource usage mode(not allowed to
|
||||
* use spare CPU)
|
||||
*/
|
||||
public static final String NM_LINUX_CONTAINER_CGROUPS_STRICT_RESOURCE_USAGE =
|
||||
NM_PREFIX + "linux-container-executor.cgroups.strict-resource-usage";
|
||||
public static final boolean DEFAULT_NM_LINUX_CONTAINER_CGROUPS_STRICT_RESOURCE_USAGE =
|
||||
false;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Interval of time the linux container executor should try cleaning up
|
||||
|
|
|
@ -1038,6 +1038,16 @@
|
|||
<value>^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>This flag determines whether apps should run with strict resource limits
|
||||
or be allowed to consume spare resources if they need them. For example, turning the
|
||||
flag on will restrict apps to use only their share of CPU, even if the node has spare
|
||||
CPU cycles. The default value is false i.e. use available resources. Please note that
|
||||
turning this flag on may reduce job throughput on the cluster.</description>
|
||||
<name>yarn.nodemanager.linux-container-executor.cgroups.strict-resource-usage</name>
|
||||
<value>false</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>T-file compression types used to compress aggregated logs.</description>
|
||||
<name>yarn.nodemanager.log-aggregation.compression-type</name>
|
||||
|
|
|
@ -57,6 +57,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
|||
private String cgroupMountPath;
|
||||
|
||||
private boolean cpuWeightEnabled = true;
|
||||
private boolean strictResourceUsageMode = false;
|
||||
|
||||
private final String MTAB_FILE = "/proc/mounts";
|
||||
private final String CGROUPS_FSTYPE = "cgroup";
|
||||
|
@ -71,6 +72,8 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
|||
private long deleteCgroupTimeout;
|
||||
// package private for testing purposes
|
||||
Clock clock;
|
||||
|
||||
private float yarnProcessors;
|
||||
|
||||
public CgroupsLCEResourcesHandler() {
|
||||
this.controllerPaths = new HashMap<String, String>();
|
||||
|
@ -105,6 +108,12 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
|||
cgroupPrefix = cgroupPrefix.substring(1);
|
||||
}
|
||||
|
||||
this.strictResourceUsageMode =
|
||||
conf
|
||||
.getBoolean(
|
||||
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_STRICT_RESOURCE_USAGE,
|
||||
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_STRICT_RESOURCE_USAGE);
|
||||
|
||||
int len = cgroupPrefix.length();
|
||||
if (cgroupPrefix.charAt(len - 1) == '/') {
|
||||
cgroupPrefix = cgroupPrefix.substring(0, len - 1);
|
||||
|
@ -132,8 +141,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
|||
initializeControllerPaths();
|
||||
|
||||
// cap overall usage to the number of cores allocated to YARN
|
||||
float yarnProcessors =
|
||||
NodeManagerHardwareUtils.getContainersCores(plugin, conf);
|
||||
yarnProcessors = NodeManagerHardwareUtils.getContainersCores(plugin, conf);
|
||||
int systemProcessors = plugin.getNumProcessors();
|
||||
if (systemProcessors != (int) yarnProcessors) {
|
||||
LOG.info("YARN containers restricted to " + yarnProcessors + " cores");
|
||||
|
@ -290,10 +298,25 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
|||
String containerName = containerId.toString();
|
||||
|
||||
if (isCpuWeightEnabled()) {
|
||||
int containerVCores = containerResource.getVirtualCores();
|
||||
createCgroup(CONTROLLER_CPU, containerName);
|
||||
int cpuShares = CPU_DEFAULT_WEIGHT * containerResource.getVirtualCores();
|
||||
int cpuShares = CPU_DEFAULT_WEIGHT * containerVCores;
|
||||
updateCgroup(CONTROLLER_CPU, containerName, "shares",
|
||||
String.valueOf(cpuShares));
|
||||
if (strictResourceUsageMode) {
|
||||
int nodeVCores =
|
||||
conf.getInt(YarnConfiguration.NM_VCORES,
|
||||
YarnConfiguration.DEFAULT_NM_VCORES);
|
||||
if (nodeVCores != containerVCores) {
|
||||
float containerCPU =
|
||||
(containerVCores * yarnProcessors) / (float) nodeVCores;
|
||||
int[] limits = getOverallLimits(containerCPU);
|
||||
updateCgroup(CONTROLLER_CPU, containerName, CPU_PERIOD_US,
|
||||
String.valueOf(limits[0]));
|
||||
updateCgroup(CONTROLLER_CPU, containerName, CPU_QUOTA_US,
|
||||
String.valueOf(limits[1]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
package org.apache.hadoop.yarn.server.nodemanager.util;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
|
||||
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
||||
import org.junit.Assert;
|
||||
|
@ -86,9 +88,13 @@ public class TestCgroupsLCEResourcesHandler {
|
|||
|
||||
String mtabFile;
|
||||
int[] limits = new int[2];
|
||||
boolean generateLimitsMode = false;
|
||||
|
||||
@Override
|
||||
int[] getOverallLimits(float x) {
|
||||
if (generateLimitsMode == true) {
|
||||
return super.getOverallLimits(x);
|
||||
}
|
||||
return limits;
|
||||
}
|
||||
|
||||
|
@ -116,32 +122,11 @@ public class TestCgroupsLCEResourcesHandler {
|
|||
handler.initConfig();
|
||||
|
||||
// create mock cgroup
|
||||
File cgroupDir = new File("target", UUID.randomUUID().toString());
|
||||
if (!cgroupDir.mkdir()) {
|
||||
String message = "Could not create dir " + cgroupDir.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
}
|
||||
File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
|
||||
if (!cgroupMountDir.mkdir()) {
|
||||
String message =
|
||||
"Could not create dir " + cgroupMountDir.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
}
|
||||
File cgroupDir = createMockCgroup();
|
||||
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
||||
|
||||
// create mock mtab
|
||||
String mtabContent =
|
||||
"none " + cgroupDir.getAbsolutePath() + " cgroup rw,relatime,cpu 0 0";
|
||||
File mockMtab = new File("target", UUID.randomUUID().toString());
|
||||
if (!mockMtab.exists()) {
|
||||
if (!mockMtab.createNewFile()) {
|
||||
String message = "Could not create file " + mockMtab.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
}
|
||||
}
|
||||
FileWriter mtabWriter = new FileWriter(mockMtab.getAbsoluteFile());
|
||||
mtabWriter.write(mtabContent);
|
||||
mtabWriter.close();
|
||||
mockMtab.deleteOnExit();
|
||||
File mockMtab = createMockMTab(cgroupDir);
|
||||
|
||||
// setup our handler and call init()
|
||||
handler.setMtabFile(mockMtab.getAbsolutePath());
|
||||
|
@ -156,7 +141,8 @@ public class TestCgroupsLCEResourcesHandler {
|
|||
Assert.assertFalse(quotaFile.exists());
|
||||
|
||||
// subset of cpu being used, files should be created
|
||||
conf.setInt(YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT, 75);
|
||||
conf
|
||||
.setInt(YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT, 75);
|
||||
handler.limits[0] = 100 * 1000;
|
||||
handler.limits[1] = 1000 * 1000;
|
||||
handler.init(mockLCE, plugin);
|
||||
|
@ -166,7 +152,8 @@ public class TestCgroupsLCEResourcesHandler {
|
|||
Assert.assertEquals(1000 * 1000, quota);
|
||||
|
||||
// set cpu back to 100, quota should be -1
|
||||
conf.setInt(YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT, 100);
|
||||
conf.setInt(YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT,
|
||||
100);
|
||||
handler.limits[0] = 100 * 1000;
|
||||
handler.limits[1] = 1000 * 1000;
|
||||
handler.init(mockLCE, plugin);
|
||||
|
@ -213,4 +200,130 @@ public class TestCgroupsLCEResourcesHandler {
|
|||
Assert.assertEquals(1000 * 1000, ret[0]);
|
||||
Assert.assertEquals(-1, ret[1]);
|
||||
}
|
||||
|
||||
private File createMockCgroup() throws IOException {
|
||||
File cgroupDir = new File("target", UUID.randomUUID().toString());
|
||||
if (!cgroupDir.mkdir()) {
|
||||
String message = "Could not create dir " + cgroupDir.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
}
|
||||
return cgroupDir;
|
||||
}
|
||||
|
||||
private File createMockCgroupMount(File cgroupDir) throws IOException {
|
||||
File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
|
||||
if (!cgroupMountDir.mkdir()) {
|
||||
String message =
|
||||
"Could not create dir " + cgroupMountDir.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
}
|
||||
return cgroupMountDir;
|
||||
}
|
||||
|
||||
private File createMockMTab(File cgroupDir) throws IOException {
|
||||
String mtabContent =
|
||||
"none " + cgroupDir.getAbsolutePath() + " cgroup rw,relatime,cpu 0 0";
|
||||
File mockMtab = new File("target", UUID.randomUUID().toString());
|
||||
if (!mockMtab.exists()) {
|
||||
if (!mockMtab.createNewFile()) {
|
||||
String message = "Could not create file " + mockMtab.getAbsolutePath();
|
||||
throw new IOException(message);
|
||||
}
|
||||
}
|
||||
FileWriter mtabWriter = new FileWriter(mockMtab.getAbsoluteFile());
|
||||
mtabWriter.write(mtabContent);
|
||||
mtabWriter.close();
|
||||
mockMtab.deleteOnExit();
|
||||
return mockMtab;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testContainerLimits() throws IOException {
|
||||
LinuxContainerExecutor mockLCE = new MockLinuxContainerExecutor();
|
||||
CustomCgroupsLCEResourceHandler handler =
|
||||
new CustomCgroupsLCEResourceHandler();
|
||||
handler.generateLimitsMode = true;
|
||||
YarnConfiguration conf = new YarnConfiguration();
|
||||
final int numProcessors = 4;
|
||||
ResourceCalculatorPlugin plugin =
|
||||
Mockito.mock(ResourceCalculatorPlugin.class);
|
||||
Mockito.doReturn(numProcessors).when(plugin).getNumProcessors();
|
||||
handler.setConf(conf);
|
||||
handler.initConfig();
|
||||
|
||||
// create mock cgroup
|
||||
File cgroupDir = createMockCgroup();
|
||||
File cgroupMountDir = createMockCgroupMount(cgroupDir);
|
||||
|
||||
// create mock mtab
|
||||
File mockMtab = createMockMTab(cgroupDir);
|
||||
|
||||
// setup our handler and call init()
|
||||
handler.setMtabFile(mockMtab.getAbsolutePath());
|
||||
handler.init(mockLCE, plugin);
|
||||
|
||||
// check values
|
||||
// default case - files shouldn't exist, strict mode off by default
|
||||
ContainerId id = ContainerId.fromString("container_1_1_1_1");
|
||||
handler.preExecute(id, Resource.newInstance(1024, 1));
|
||||
File containerDir = new File(cgroupMountDir, id.toString());
|
||||
Assert.assertTrue(containerDir.exists());
|
||||
Assert.assertTrue(containerDir.isDirectory());
|
||||
File periodFile = new File(containerDir, "cpu.cfs_period_us");
|
||||
File quotaFile = new File(containerDir, "cpu.cfs_quota_us");
|
||||
Assert.assertFalse(periodFile.exists());
|
||||
Assert.assertFalse(quotaFile.exists());
|
||||
|
||||
// no files created because we're using all cpu
|
||||
FileUtils.deleteQuietly(containerDir);
|
||||
conf.setBoolean(
|
||||
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_STRICT_RESOURCE_USAGE, true);
|
||||
handler.initConfig();
|
||||
handler.preExecute(id,
|
||||
Resource.newInstance(1024, YarnConfiguration.DEFAULT_NM_VCORES));
|
||||
Assert.assertTrue(containerDir.exists());
|
||||
Assert.assertTrue(containerDir.isDirectory());
|
||||
periodFile = new File(containerDir, "cpu.cfs_period_us");
|
||||
quotaFile = new File(containerDir, "cpu.cfs_quota_us");
|
||||
Assert.assertFalse(periodFile.exists());
|
||||
Assert.assertFalse(quotaFile.exists());
|
||||
|
||||
// 50% of CPU
|
||||
FileUtils.deleteQuietly(containerDir);
|
||||
conf.setBoolean(
|
||||
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_STRICT_RESOURCE_USAGE, true);
|
||||
handler.initConfig();
|
||||
handler.preExecute(id,
|
||||
Resource.newInstance(1024, YarnConfiguration.DEFAULT_NM_VCORES / 2));
|
||||
Assert.assertTrue(containerDir.exists());
|
||||
Assert.assertTrue(containerDir.isDirectory());
|
||||
periodFile = new File(containerDir, "cpu.cfs_period_us");
|
||||
quotaFile = new File(containerDir, "cpu.cfs_quota_us");
|
||||
Assert.assertTrue(periodFile.exists());
|
||||
Assert.assertTrue(quotaFile.exists());
|
||||
Assert.assertEquals(500 * 1000, readIntFromFile(periodFile));
|
||||
Assert.assertEquals(1000 * 1000, readIntFromFile(quotaFile));
|
||||
|
||||
// CGroups set to 50% of CPU, container set to 50% of YARN CPU
|
||||
FileUtils.deleteQuietly(containerDir);
|
||||
conf.setBoolean(
|
||||
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_STRICT_RESOURCE_USAGE, true);
|
||||
conf
|
||||
.setInt(YarnConfiguration.NM_RESOURCE_PERCENTAGE_PHYSICAL_CPU_LIMIT, 50);
|
||||
handler.initConfig();
|
||||
handler.init(mockLCE, plugin);
|
||||
handler.preExecute(id,
|
||||
Resource.newInstance(1024, YarnConfiguration.DEFAULT_NM_VCORES / 2));
|
||||
Assert.assertTrue(containerDir.exists());
|
||||
Assert.assertTrue(containerDir.isDirectory());
|
||||
periodFile = new File(containerDir, "cpu.cfs_period_us");
|
||||
quotaFile = new File(containerDir, "cpu.cfs_quota_us");
|
||||
Assert.assertTrue(periodFile.exists());
|
||||
Assert.assertTrue(quotaFile.exists());
|
||||
Assert.assertEquals(1000 * 1000, readIntFromFile(periodFile));
|
||||
Assert.assertEquals(1000 * 1000, readIntFromFile(quotaFile));
|
||||
|
||||
FileUtils.deleteQuietly(cgroupDir);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue