YARN-8930. CGroup-based strict container memory enforcement does not work with CGroupElasticMemoryController (haibochen via rkanter)
commit f76e3c3db7
parent fb2b72e6fc
@@ -34,9 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;

/**
 * Handler class to handle the memory controller. YARN already ships a

@@ -174,26 +171,4 @@ public class CGroupsMemoryResourceHandlerImpl implements MemoryResourceHandler {
  public List<PrivilegedOperation> teardown() throws ResourceHandlerException {
    return null;
  }

  @Override
  public Optional<Boolean> isUnderOOM(ContainerId containerId) {
    try {
      String status = cGroupsHandler.getCGroupParam(
          CGroupsHandler.CGroupController.MEMORY,
          containerId.toString(),
          CGROUP_PARAM_MEMORY_OOM_CONTROL);
      if (LOG.isDebugEnabled()) {
        LOG.debug("cgroups OOM status for " + containerId + ": " + status);
      }
      if (status.contains(CGroupsHandler.UNDER_OOM)) {
        LOG.warn("Container " + containerId + " under OOM based on cgroups.");
        return Optional.of(true);
      } else {
        return Optional.of(false);
      }
    } catch (ResourceHandlerException e) {
      LOG.warn("Could not read cgroups " + containerId, e);
    }
    return Optional.empty();
  }
}
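
For context, the `CGROUP_PARAM_MEMORY_OOM_CONTROL` value read above maps to the cgroup v1 file `memory.oom_control`, which the kernel populates with an `under_oom 0|1` line. Below is a minimal standalone sketch of the same tri-state check, assuming cgroup v1 is mounted at `/sys/fs/cgroup/memory` and the NodeManager hierarchy is named `hadoop-yarn`; both paths are illustrative assumptions, not taken from this patch.

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Optional;

public class UnderOomProbe {
  // Assumed cgroup v1 mount point and NM hierarchy name; adjust for the local setup.
  private static final String MEMORY_CGROUP_ROOT = "/sys/fs/cgroup/memory/hadoop-yarn";

  /**
   * Mirrors the tri-state contract of isUnderOOM: empty when the status cannot
   * be read, true/false depending on the under_oom flag in memory.oom_control.
   */
  static Optional<Boolean> isUnderOom(String cgroupName) {
    Path oomControl = Paths.get(MEMORY_CGROUP_ROOT, cgroupName, "memory.oom_control");
    try {
      // memory.oom_control contains lines such as "oom_kill_disable 0" and "under_oom 0".
      for (String line : Files.readAllLines(oomControl)) {
        if (line.startsWith("under_oom")) {
          return Optional.of(line.trim().endsWith("1"));
        }
      }
      return Optional.of(false);
    } catch (IOException e) {
      return Optional.empty();
    }
  }

  public static void main(String[] args) {
    System.out.println(isUnderOom(args[0]).map(String::valueOf).orElse("unknown"));
  }
}
```

The handler in the patch goes through CGroupsHandler instead of touching the filesystem directly, which keeps path resolution and error handling in one place.
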
@@ -20,18 +20,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.yarn.api.records.ContainerId;

import java.util.Optional;

@InterfaceAudience.Private
@InterfaceStability.Unstable
public interface MemoryResourceHandler extends ResourceHandler {
  /**
   * Check whether a container is under OOM.
   * @param containerId the id of the container
   * @return empty if the status is unknown, true if the container is under
   *         oom, false otherwise
   */
  Optional<Boolean> isUnderOOM(ContainerId containerId);
}

@@ -22,7 +22,6 @@ import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupElasticMemoryController;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.MemoryResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerModule;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.slf4j.Logger;

@@ -52,7 +51,6 @@ import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
import java.util.Arrays;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;

/**

@@ -699,75 +697,61 @@ public class ContainersMonitorImpl extends AbstractService implements
      ProcessTreeInfo ptInfo,
      long currentVmemUsage,
      long currentPmemUsage) {
    Optional<Boolean> isMemoryOverLimit = Optional.empty();
    if (strictMemoryEnforcement && !elasticMemoryEnforcement) {
      // When cgroup-based strict memory enforcement is used alone without
      // elastic memory control, the oom-kill would take care of it.
      // However, when elastic memory control is also enabled, the oom killer
      // would be disabled at the root yarn container cgroup level (all child
      // cgroups would inherit that setting). Hence, we fall back to the
      // polling-based mechanism.
      return;
    }
    boolean isMemoryOverLimit = false;
    String msg = "";
    int containerExitStatus = ContainerExitStatus.INVALID;

    if (strictMemoryEnforcement && elasticMemoryEnforcement) {
      // Both elastic memory control and strict memory control are enabled
      // through cgroups. A container will be frozen by the elastic memory
      // control mechanism if it exceeds its request, so we check for this
      // here and kill it. Otherwise, the container will not be killed if
      // the node never exceeds its limit and the procfs-based
      // memory accounting is different from the cgroup-based accounting.

      MemoryResourceHandler handler =
          ResourceHandlerModule.getMemoryResourceHandler();
      if (handler != null) {
        isMemoryOverLimit = handler.isUnderOOM(containerId);
        containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
        msg = containerId + " is under oom because it exceeded its" +
            " physical memory limit";
      }
    } else if (strictMemoryEnforcement || elasticMemoryEnforcement) {
      // if cgroup-based memory control is enabled
      isMemoryOverLimit = Optional.of(false);
      long vmemLimit = ptInfo.getVmemLimit();
      long pmemLimit = ptInfo.getPmemLimit();
      // as processes begin with an age 1, we want to see if there
      // are processes more than 1 iteration old.
      long curMemUsageOfAgedProcesses = pTree.getVirtualMemorySize(1);
      long curRssMemUsageOfAgedProcesses = pTree.getRssMemorySize(1);
      if (isVmemCheckEnabled()
          && isProcessTreeOverLimit(containerId.toString(),
          currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
        // The current usage (age=0) is always higher than the aged usage. We
        // do not show the aged size in the message, base the delta on the
        // current usage
        long delta = currentVmemUsage - vmemLimit;
        // Container (the root process) is still alive and overflowing
        // memory.
        // Dump the process-tree and then clean it up.
        msg = formatErrorMessage("virtual",
            formatUsageString(currentVmemUsage, vmemLimit,
                currentPmemUsage, pmemLimit),
            pId, containerId, pTree, delta);
        isMemoryOverLimit = true;
        containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
      } else if (isPmemCheckEnabled()
          && isProcessTreeOverLimit(containerId.toString(),
          currentPmemUsage, curRssMemUsageOfAgedProcesses,
          pmemLimit)) {
        // The current usage (age=0) is always higher than the aged usage. We
        // do not show the aged size in the message, base the delta on the
        // current usage
        long delta = currentPmemUsage - pmemLimit;
        // Container (the root process) is still alive and overflowing
        // memory.
        // Dump the process-tree and then clean it up.
        msg = formatErrorMessage("physical",
            formatUsageString(currentVmemUsage, vmemLimit,
                currentPmemUsage, pmemLimit),
            pId, containerId, pTree, delta);
        isMemoryOverLimit = true;
        containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
      }

    if (!isMemoryOverLimit.isPresent()) {
      long vmemLimit = ptInfo.getVmemLimit();
      long pmemLimit = ptInfo.getPmemLimit();
      // as processes begin with an age 1, we want to see if there
      // are processes more than 1 iteration old.
      long curMemUsageOfAgedProcesses = pTree.getVirtualMemorySize(1);
      long curRssMemUsageOfAgedProcesses = pTree.getRssMemorySize(1);
      if (isVmemCheckEnabled()
          && isProcessTreeOverLimit(containerId.toString(),
          currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
        // The current usage (age=0) is always higher than the aged usage. We
        // do not show the aged size in the message, base the delta on the
        // current usage
        long delta = currentVmemUsage - vmemLimit;
        // Container (the root process) is still alive and overflowing
        // memory.
        // Dump the process-tree and then clean it up.
        msg = formatErrorMessage("virtual",
            formatUsageString(currentVmemUsage, vmemLimit,
                currentPmemUsage, pmemLimit),
            pId, containerId, pTree, delta);
        isMemoryOverLimit = Optional.of(true);
        containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_VMEM;
      } else if (isPmemCheckEnabled()
          && isProcessTreeOverLimit(containerId.toString(),
          currentPmemUsage, curRssMemUsageOfAgedProcesses,
          pmemLimit)) {
        // The current usage (age=0) is always higher than the aged usage. We
        // do not show the aged size in the message, base the delta on the
        // current usage
        long delta = currentPmemUsage - pmemLimit;
        // Container (the root process) is still alive and overflowing
        // memory.
        // Dump the process-tree and then clean it up.
        msg = formatErrorMessage("physical",
            formatUsageString(currentVmemUsage, vmemLimit,
                currentPmemUsage, pmemLimit),
            pId, containerId, pTree, delta);
        isMemoryOverLimit = Optional.of(true);
        containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
      }
    }

    if (isMemoryOverLimit.isPresent() && isMemoryOverLimit.get()) {
    if (isMemoryOverLimit) {
      // Virtual or physical memory over limit. Fail the container and
      // remove
      // the corresponding process tree
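
Read together, the two variants interleaved in this hunk boil down to one question: which mechanism is responsible for acting on a container that exceeds its memory limit. The following sketch summarizes the behavior described by the comment at the top of the hunk and by the documentation change at the end of this commit; the class and enum are illustrative only and are not part of the patch.

```java
/**
 * Illustrative sketch (not part of the patch): which mechanism ends up
 * enforcing a container's memory limit for a given configuration.
 */
final class MemoryEnforcementDecision {
  enum Mode {
    // Strict cgroup enforcement alone: the kernel OOM killer in the
    // container's memory cgroup kills offending processes.
    CGROUP_OOM_KILLER,
    // Elastic memory control disables the OOM killer on the root yarn
    // container cgroup (inherited by child cgroups), or no cgroup-based
    // enforcement is configured: ContainersMonitor polls procfs usage.
    POLLING
  }

  static Mode decide(boolean strictMemoryEnforcement,
      boolean elasticMemoryEnforcement) {
    if (strictMemoryEnforcement && !elasticMemoryEnforcement) {
      // The per-cgroup oom-kill takes care of the container.
      return Mode.CGROUP_OOM_KILLER;
    }
    // Fall back to the polling-based vmem/pmem checks.
    return Mode.POLLING;
  }
}
```
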
@@ -31,9 +31,6 @@ import org.junit.Test;
import org.junit.Assert;

import java.util.List;
import java.util.Optional;

import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
import static org.mockito.Mockito.*;

/**

@@ -244,45 +241,4 @@ public class TestCGroupsMemoryResourceHandlerImpl {
        .updateCGroupParam(CGroupsHandler.CGroupController.MEMORY, id,
            CGroupsHandler.CGROUP_PARAM_MEMORY_HARD_LIMIT_BYTES, "1024M");
  }

  @Test
  public void testContainerUnderOom() throws Exception {
    Configuration conf = new YarnConfiguration();
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);

    cGroupsMemoryResourceHandler.bootstrap(conf);

    ContainerId containerId = mock(ContainerId.class);
    when(containerId.toString()).thenReturn("container_01_01");

    when(mockCGroupsHandler.getCGroupParam(
        CGroupsHandler.CGroupController.MEMORY,
        containerId.toString(),
        CGROUP_PARAM_MEMORY_OOM_CONTROL)).thenReturn(CGroupsHandler.UNDER_OOM);
    Optional<Boolean> outOfOom =
        cGroupsMemoryResourceHandler.isUnderOOM(containerId);
    Assert.assertTrue("The container should be reported to run under oom",
        outOfOom.isPresent() && outOfOom.get().equals(true));

    when(mockCGroupsHandler.getCGroupParam(
        CGroupsHandler.CGroupController.MEMORY,
        containerId.toString(),
        CGROUP_PARAM_MEMORY_OOM_CONTROL)).thenReturn("");
    outOfOom = cGroupsMemoryResourceHandler.isUnderOOM(containerId);
    Assert.assertTrue(
        "The container should not be reported to run under oom",
        outOfOom.isPresent() && outOfOom.get().equals(false));

    when(mockCGroupsHandler.getCGroupParam(
        CGroupsHandler.CGroupController.MEMORY,
        containerId.toString(),
        CGROUP_PARAM_MEMORY_OOM_CONTROL)).
        thenThrow(new ResourceHandlerException());
    outOfOom = cGroupsMemoryResourceHandler.isUnderOOM(containerId);
    Assert.assertFalse(
        "No report of the oom status should be available.",
        outOfOom.isPresent());
  }
}

@@ -20,8 +20,6 @@ YARN has multiple features to enforce container memory limits. There are three t
2. Strict memory control kills each container that has exceeded its limits. It uses the OOM killer capability of the cgroups Linux kernel feature.
3. Elastic memory control is also based on cgroups. It allows bursting and starts killing containers only if the overall system memory usage reaches a limit.

If you use feature 2 or feature 3, feature 1 is disabled.

Strict Memory Feature
---------------------

@@ -131,3 +129,13 @@ Configure the cgroups prerequisites mentioned above.
`yarn.nodemanager.resource.memory.enforced` should be `false`

`yarn.nodemanager.pmem-check-enabled` or `yarn.nodemanager.vmem-check-enabled` should be `true`. Set the former if swapping is turned off, and the latter otherwise.

Configuring elastic memory control and strict container memory enforcement through cgroups
------------------------------------------
ADVANCED ONLY

Elastic memory control and strict container memory enforcement can be enabled at the same time to allow the NodeManager to over-allocate itself.
However, elastic memory control changes how strict container memory enforcement through cgroups is performed. Elastic memory control
disables the OOM killer on the root yarn container cgroup. The OOM killer setting overrides that of individual container cgroups, so individual
containers won't be killed by the OOM killer when they go over their memory limit. The strict container memory enforcement in this case falls
back to the polling-based mechanism.
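
To make the combined setup concrete, here is a hedged sketch of a NodeManager `Configuration` with both mechanisms turned on. The check properties and `yarn.nodemanager.resource.memory.enforced` come from the text above; the `yarn.nodemanager.elastic-memory-control.enabled` property name and the use of the Java `Configuration` API rather than `yarn-site.xml` are assumptions made for illustration, so verify them against your Hadoop release. The cgroups prerequisites mentioned earlier in this document still have to be configured separately.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class ElasticPlusStrictMemoryConfig {
  public static Configuration build() {
    Configuration conf = new YarnConfiguration();
    // Elastic memory control at the node level (assumed property name).
    conf.setBoolean("yarn.nodemanager.elastic-memory-control.enabled", true);
    // Strict per-container enforcement through cgroups.
    conf.setBoolean("yarn.nodemanager.resource.memory.enforced", true);
    // With elastic control enabled, the per-container OOM killer is disabled,
    // so strict enforcement falls back to polling; keep one of the checks on.
    // pmem-check when swapping is turned off, vmem-check otherwise.
    conf.setBoolean("yarn.nodemanager.pmem-check-enabled", true);
    conf.setBoolean("yarn.nodemanager.vmem-check-enabled", false);
    return conf;
  }
}
```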