YARN-8930. CGroup-based strict container memory enforcement does not work with CGroupElasticMemoryController (haibochen via rkanter)
This commit is contained in:
parent
fb2b72e6fc
commit
f76e3c3db7
|
@ -34,9 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handler class to handle the memory controller. YARN already ships a
|
* Handler class to handle the memory controller. YARN already ships a
|
||||||
|
@ -174,26 +171,4 @@ public class CGroupsMemoryResourceHandlerImpl implements MemoryResourceHandler {
|
||||||
public List<PrivilegedOperation> teardown() throws ResourceHandlerException {
|
public List<PrivilegedOperation> teardown() throws ResourceHandlerException {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Optional<Boolean> isUnderOOM(ContainerId containerId) {
|
|
||||||
try {
|
|
||||||
String status = cGroupsHandler.getCGroupParam(
|
|
||||||
CGroupsHandler.CGroupController.MEMORY,
|
|
||||||
containerId.toString(),
|
|
||||||
CGROUP_PARAM_MEMORY_OOM_CONTROL);
|
|
||||||
if (LOG.isDebugEnabled()) {
|
|
||||||
LOG.debug("cgroups OOM status for " + containerId + ": " + status);
|
|
||||||
}
|
|
||||||
if (status.contains(CGroupsHandler.UNDER_OOM)) {
|
|
||||||
LOG.warn("Container " + containerId + " under OOM based on cgroups.");
|
|
||||||
return Optional.of(true);
|
|
||||||
} else {
|
|
||||||
return Optional.of(false);
|
|
||||||
}
|
|
||||||
} catch (ResourceHandlerException e) {
|
|
||||||
LOG.warn("Could not read cgroups" + containerId, e);
|
|
||||||
}
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,18 +20,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
|
||||||
|
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.classification.InterfaceStability;
|
import org.apache.hadoop.classification.InterfaceStability;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
|
||||||
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
@InterfaceAudience.Private
|
@InterfaceAudience.Private
|
||||||
@InterfaceStability.Unstable
|
@InterfaceStability.Unstable
|
||||||
public interface MemoryResourceHandler extends ResourceHandler {
|
public interface MemoryResourceHandler extends ResourceHandler {
|
||||||
/**
|
|
||||||
* check whether a container is under OOM.
|
|
||||||
* @param containerId the id of the container
|
|
||||||
* @return empty if the status is unknown, true if the container is under oom,
|
|
||||||
* false otherwise
|
|
||||||
*/
|
|
||||||
Optional<Boolean> isUnderOOM(ContainerId containerId);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,6 @@ import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupElasticMemoryController;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupElasticMemoryController;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.MemoryResourceHandler;
|
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerModule;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerModule;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -52,7 +51,6 @@ import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -699,75 +697,61 @@ public class ContainersMonitorImpl extends AbstractService implements
|
||||||
ProcessTreeInfo ptInfo,
|
ProcessTreeInfo ptInfo,
|
||||||
long currentVmemUsage,
|
long currentVmemUsage,
|
||||||
long currentPmemUsage) {
|
long currentPmemUsage) {
|
||||||
Optional<Boolean> isMemoryOverLimit = Optional.empty();
|
if (strictMemoryEnforcement && !elasticMemoryEnforcement) {
|
||||||
|
// When cgroup-based strict memory enforcement is used alone without
|
||||||
|
// elastic memory control, the oom-kill would take care of it.
|
||||||
|
// However, when elastic memory control is also enabled, the oom killer
|
||||||
|
// would be disabled at the root yarn container cgroup level (all child
|
||||||
|
// cgroups would inherit that setting). Hence, we fall back to the
|
||||||
|
// polling-based mechanism.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
boolean isMemoryOverLimit = false;
|
||||||
String msg = "";
|
String msg = "";
|
||||||
int containerExitStatus = ContainerExitStatus.INVALID;
|
int containerExitStatus = ContainerExitStatus.INVALID;
|
||||||
|
|
||||||
if (strictMemoryEnforcement && elasticMemoryEnforcement) {
|
long vmemLimit = ptInfo.getVmemLimit();
|
||||||
// Both elastic memory control and strict memory control are enabled
|
long pmemLimit = ptInfo.getPmemLimit();
|
||||||
// through cgroups. A container will be frozen by the elastic memory
|
// as processes begin with an age 1, we want to see if there
|
||||||
// control mechanism if it exceeds its request, so we check for this
|
// are processes more than 1 iteration old.
|
||||||
// here and kill it. Otherwise, the container will not be killed if
|
long curMemUsageOfAgedProcesses = pTree.getVirtualMemorySize(1);
|
||||||
// the node never exceeds its limit and the procfs-based
|
long curRssMemUsageOfAgedProcesses = pTree.getRssMemorySize(1);
|
||||||
// memory accounting is different from the cgroup-based accounting.
|
if (isVmemCheckEnabled()
|
||||||
|
&& isProcessTreeOverLimit(containerId.toString(),
|
||||||
MemoryResourceHandler handler =
|
currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
|
||||||
ResourceHandlerModule.getMemoryResourceHandler();
|
// The current usage (age=0) is always higher than the aged usage. We
|
||||||
if (handler != null) {
|
// do not show the aged size in the message, base the delta on the
|
||||||
isMemoryOverLimit = handler.isUnderOOM(containerId);
|
// current usage
|
||||||
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
|
long delta = currentVmemUsage - vmemLimit;
|
||||||
msg = containerId + " is under oom because it exceeded its" +
|
// Container (the root process) is still alive and overflowing
|
||||||
" physical memory limit";
|
// memory.
|
||||||
}
|
// Dump the process-tree and then clean it up.
|
||||||
} else if (strictMemoryEnforcement || elasticMemoryEnforcement) {
|
msg = formatErrorMessage("virtual",
|
||||||
// if cgroup-based memory control is enabled
|
formatUsageString(currentVmemUsage, vmemLimit,
|
||||||
isMemoryOverLimit = Optional.of(false);
|
currentPmemUsage, pmemLimit),
|
||||||
|
pId, containerId, pTree, delta);
|
||||||
|
isMemoryOverLimit = true;
|
||||||
|
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
|
||||||
|
} else if (isPmemCheckEnabled()
|
||||||
|
&& isProcessTreeOverLimit(containerId.toString(),
|
||||||
|
currentPmemUsage, curRssMemUsageOfAgedProcesses,
|
||||||
|
pmemLimit)) {
|
||||||
|
// The current usage (age=0) is always higher than the aged usage. We
|
||||||
|
// do not show the aged size in the message, base the delta on the
|
||||||
|
// current usage
|
||||||
|
long delta = currentPmemUsage - pmemLimit;
|
||||||
|
// Container (the root process) is still alive and overflowing
|
||||||
|
// memory.
|
||||||
|
// Dump the process-tree and then clean it up.
|
||||||
|
msg = formatErrorMessage("physical",
|
||||||
|
formatUsageString(currentVmemUsage, vmemLimit,
|
||||||
|
currentPmemUsage, pmemLimit),
|
||||||
|
pId, containerId, pTree, delta);
|
||||||
|
isMemoryOverLimit = true;
|
||||||
|
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isMemoryOverLimit.isPresent()) {
|
if (isMemoryOverLimit) {
|
||||||
long vmemLimit = ptInfo.getVmemLimit();
|
|
||||||
long pmemLimit = ptInfo.getPmemLimit();
|
|
||||||
// as processes begin with an age 1, we want to see if there
|
|
||||||
// are processes more than 1 iteration old.
|
|
||||||
long curMemUsageOfAgedProcesses = pTree.getVirtualMemorySize(1);
|
|
||||||
long curRssMemUsageOfAgedProcesses = pTree.getRssMemorySize(1);
|
|
||||||
if (isVmemCheckEnabled()
|
|
||||||
&& isProcessTreeOverLimit(containerId.toString(),
|
|
||||||
currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
|
|
||||||
// The current usage (age=0) is always higher than the aged usage. We
|
|
||||||
// do not show the aged size in the message, base the delta on the
|
|
||||||
// current usage
|
|
||||||
long delta = currentVmemUsage - vmemLimit;
|
|
||||||
// Container (the root process) is still alive and overflowing
|
|
||||||
// memory.
|
|
||||||
// Dump the process-tree and then clean it up.
|
|
||||||
msg = formatErrorMessage("virtual",
|
|
||||||
formatUsageString(currentVmemUsage, vmemLimit,
|
|
||||||
currentPmemUsage, pmemLimit),
|
|
||||||
pId, containerId, pTree, delta);
|
|
||||||
isMemoryOverLimit = Optional.of(true);
|
|
||||||
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
|
|
||||||
} else if (isPmemCheckEnabled()
|
|
||||||
&& isProcessTreeOverLimit(containerId.toString(),
|
|
||||||
currentPmemUsage, curRssMemUsageOfAgedProcesses,
|
|
||||||
pmemLimit)) {
|
|
||||||
// The current usage (age=0) is always higher than the aged usage. We
|
|
||||||
// do not show the aged size in the message, base the delta on the
|
|
||||||
// current usage
|
|
||||||
long delta = currentPmemUsage - pmemLimit;
|
|
||||||
// Container (the root process) is still alive and overflowing
|
|
||||||
// memory.
|
|
||||||
// Dump the process-tree and then clean it up.
|
|
||||||
msg = formatErrorMessage("physical",
|
|
||||||
formatUsageString(currentVmemUsage, vmemLimit,
|
|
||||||
currentPmemUsage, pmemLimit),
|
|
||||||
pId, containerId, pTree, delta);
|
|
||||||
isMemoryOverLimit = Optional.of(true);
|
|
||||||
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isMemoryOverLimit.isPresent() && isMemoryOverLimit.get()) {
|
|
||||||
// Virtual or physical memory over limit. Fail the container and
|
// Virtual or physical memory over limit. Fail the container and
|
||||||
// remove
|
// remove
|
||||||
// the corresponding process tree
|
// the corresponding process tree
|
||||||
|
|
|
@ -31,9 +31,6 @@ import org.junit.Test;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
|
|
||||||
import static org.mockito.Mockito.*;
|
import static org.mockito.Mockito.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -244,45 +241,4 @@ public class TestCGroupsMemoryResourceHandlerImpl {
|
||||||
.updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id,
|
.updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id,
|
||||||
CGroupsHandler.CGROUP_PARAM_MEMORY_HARD_LIMIT_BYTES, "1024M");
|
CGroupsHandler.CGROUP_PARAM_MEMORY_HARD_LIMIT_BYTES, "1024M");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testContainerUnderOom() throws Exception {
|
|
||||||
Configuration conf = new YarnConfiguration();
|
|
||||||
conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
|
|
||||||
conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);
|
|
||||||
|
|
||||||
cGroupsMemoryResourceHandler.bootstrap(conf);
|
|
||||||
|
|
||||||
ContainerId containerId = mock(ContainerId.class);
|
|
||||||
when(containerId.toString()).thenReturn("container_01_01");
|
|
||||||
|
|
||||||
when(mockCGroupsHandler.getCGroupParam(
|
|
||||||
CGroupsHandler.CGroupController.MEMORY,
|
|
||||||
containerId.toString(),
|
|
||||||
CGROUP_PARAM_MEMORY_OOM_CONTROL)).thenReturn(CGroupsHandler.UNDER_OOM);
|
|
||||||
Optional<Boolean> outOfOom =
|
|
||||||
cGroupsMemoryResourceHandler.isUnderOOM(containerId);
|
|
||||||
Assert.assertTrue("The container should be reported to run under oom",
|
|
||||||
outOfOom.isPresent() && outOfOom.get().equals(true));
|
|
||||||
|
|
||||||
when(mockCGroupsHandler.getCGroupParam(
|
|
||||||
CGroupsHandler.CGroupController.MEMORY,
|
|
||||||
containerId.toString(),
|
|
||||||
CGROUP_PARAM_MEMORY_OOM_CONTROL)).thenReturn("");
|
|
||||||
outOfOom = cGroupsMemoryResourceHandler.isUnderOOM(containerId);
|
|
||||||
Assert.assertTrue(
|
|
||||||
"The container should not be reported to run under oom",
|
|
||||||
outOfOom.isPresent() && outOfOom.get().equals(false));
|
|
||||||
|
|
||||||
when(mockCGroupsHandler.getCGroupParam(
|
|
||||||
CGroupsHandler.CGroupController.MEMORY,
|
|
||||||
containerId.toString(),
|
|
||||||
CGROUP_PARAM_MEMORY_OOM_CONTROL)).
|
|
||||||
thenThrow(new ResourceHandlerException());
|
|
||||||
outOfOom = cGroupsMemoryResourceHandler.isUnderOOM(containerId);
|
|
||||||
Assert.assertFalse(
|
|
||||||
"No report of the oom status should be available.",
|
|
||||||
outOfOom.isPresent());
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,8 +20,6 @@ YARN has multiple features to enforce container memory limits. There are three t
|
||||||
2. Strict memory control kills each container that has exceeded its limits. It uses the OOM killer capability of the cgroups Linux kernel feature.
|
2. Strict memory control kills each container that has exceeded its limits. It uses the OOM killer capability of the cgroups Linux kernel feature.
|
||||||
3. Elastic memory control is also based on cgroups. It allows bursting and starts killing containers only if the overall system memory usage reaches a limit.
|
3. Elastic memory control is also based on cgroups. It allows bursting and starts killing containers only if the overall system memory usage reaches a limit.
|
||||||
|
|
||||||
If you use feature 2 or 3, feature 1 is disabled.
|
|
||||||
|
|
||||||
Strict Memory Feature
|
Strict Memory Feature
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
@ -131,3 +129,13 @@ Configure the cgroups prerequisites mentioned above.
|
||||||
`yarn.nodemanager.resource.memory.enforced` should be `false`
|
`yarn.nodemanager.resource.memory.enforced` should be `false`
|
||||||
|
|
||||||
`yarn.nodemanager.pmem-check-enabled` or `yarn.nodemanager.vmem-check-enabled` should be `true`. If swapping is turned off, the former should be set; otherwise, the latter should be set.
|
`yarn.nodemanager.pmem-check-enabled` or `yarn.nodemanager.vmem-check-enabled` should be `true`. If swapping is turned off, the former should be set; otherwise, the latter should be set.
|
||||||
|
|
||||||
|
|
||||||
|
Configuring elastic memory control and strict container memory enforcement through cgroups
|
||||||
|
------------------------------------------
|
||||||
|
ADVANCED ONLY
|
||||||
|
Elastic memory control and strict container memory enforcement can be enabled at the same time to allow the NodeManager to over-allocate itself.
|
||||||
|
However, elastic memory control changes how strict container memory enforcement through cgroups is performed. Elastic memory control
|
||||||
|
disables the OOM killer on the root YARN container cgroup. That root-level setting overrides the settings of individual container cgroups, so individual
|
||||||
|
containers won't be killed by the OOM killer when they go over their memory limit. In this case, strict container memory enforcement falls
|
||||||
|
back to the polling-based mechanism.
|
||||||
|
|
Loading…
Reference in New Issue