YARN-1284. LCE: Race condition leaves dangling cgroups entries for killed containers. (Alejandro Abdelnur via Sandy Ryza)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1530492 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b7e885435a
commit
726c3538a7
|
@ -69,6 +69,9 @@ Release 2.3.0 - UNRELEASED
|
||||||
|
|
||||||
YARN-1268. TestFairScheduler.testContinuousScheduling is flaky (Sandy Ryza)
|
YARN-1268. TestFairScheduler.testContinuousScheduling is flaky (Sandy Ryza)
|
||||||
|
|
||||||
|
YARN-1284. LCE: Race condition leaves dangling cgroups entries for killed
|
||||||
|
containers. (Alejandro Abdelnur via Sandy Ryza)
|
||||||
|
|
||||||
Release 2.2.1 - UNRELEASED
|
Release 2.2.1 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -675,6 +675,18 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH =
|
public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH =
|
||||||
NM_PREFIX + "linux-container-executor.cgroups.mount-path";
|
NM_PREFIX + "linux-container-executor.cgroups.mount-path";
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interval of time the linux container executor should try cleaning up
|
||||||
|
* cgroups entry when cleaning up a container. This is required due to what
|
||||||
|
* it seems a race condition because the SIGTERM/SIGKILL is asynch.
|
||||||
|
*/
|
||||||
|
public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
|
||||||
|
NM_PREFIX + "linux-container-executor.cgroups.delete-timeout-ms";
|
||||||
|
|
||||||
|
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
|
||||||
|
1000;
|
||||||
|
|
||||||
/** T-file compression types used to compress aggregated logs.*/
|
/** T-file compression types used to compress aggregated logs.*/
|
||||||
public static final String NM_LOG_AGG_COMPRESSION_TYPE =
|
public static final String NM_LOG_AGG_COMPRESSION_TYPE =
|
||||||
NM_PREFIX + "log-aggregation.compression-type";
|
NM_PREFIX + "log-aggregation.compression-type";
|
||||||
|
|
|
@ -32,6 +32,7 @@ import java.util.Map.Entry;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
@ -40,6 +41,8 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
|
import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
|
||||||
|
import org.apache.hadoop.yarn.util.Clock;
|
||||||
|
import org.apache.hadoop.yarn.util.SystemClock;
|
||||||
|
|
||||||
public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||||
|
|
||||||
|
@ -59,8 +62,13 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||||
private final int CPU_DEFAULT_WEIGHT = 1024; // set by kernel
|
private final int CPU_DEFAULT_WEIGHT = 1024; // set by kernel
|
||||||
private final Map<String, String> controllerPaths; // Controller -> path
|
private final Map<String, String> controllerPaths; // Controller -> path
|
||||||
|
|
||||||
|
private long deleteCgroupTimeout;
|
||||||
|
// package private for testing purposes
|
||||||
|
Clock clock;
|
||||||
|
|
||||||
public CgroupsLCEResourcesHandler() {
|
public CgroupsLCEResourcesHandler() {
|
||||||
this.controllerPaths = new HashMap<String, String>();
|
this.controllerPaths = new HashMap<String, String>();
|
||||||
|
clock = new SystemClock();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -73,7 +81,8 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||||
return conf;
|
return conf;
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized void init(LinuxContainerExecutor lce) throws IOException {
|
@VisibleForTesting
|
||||||
|
void initConfig() throws IOException {
|
||||||
|
|
||||||
this.cgroupPrefix = conf.get(YarnConfiguration.
|
this.cgroupPrefix = conf.get(YarnConfiguration.
|
||||||
NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, "/hadoop-yarn");
|
NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, "/hadoop-yarn");
|
||||||
|
@ -82,6 +91,9 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||||
this.cgroupMountPath = conf.get(YarnConfiguration.
|
this.cgroupMountPath = conf.get(YarnConfiguration.
|
||||||
NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, null);
|
NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, null);
|
||||||
|
|
||||||
|
this.deleteCgroupTimeout = conf.getLong(
|
||||||
|
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
|
||||||
|
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
|
||||||
// remove extra /'s at end or start of cgroupPrefix
|
// remove extra /'s at end or start of cgroupPrefix
|
||||||
if (cgroupPrefix.charAt(0) == '/') {
|
if (cgroupPrefix.charAt(0) == '/') {
|
||||||
cgroupPrefix = cgroupPrefix.substring(1);
|
cgroupPrefix = cgroupPrefix.substring(1);
|
||||||
|
@ -91,6 +103,10 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||||
if (cgroupPrefix.charAt(len - 1) == '/') {
|
if (cgroupPrefix.charAt(len - 1) == '/') {
|
||||||
cgroupPrefix = cgroupPrefix.substring(0, len - 1);
|
cgroupPrefix = cgroupPrefix.substring(0, len - 1);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void init(LinuxContainerExecutor lce) throws IOException {
|
||||||
|
initConfig();
|
||||||
|
|
||||||
// mount cgroups if requested
|
// mount cgroups if requested
|
||||||
if (cgroupMount && cgroupMountPath != null) {
|
if (cgroupMount && cgroupMountPath != null) {
|
||||||
|
@ -158,14 +174,32 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void deleteCgroup(String controller, String groupName) {
|
@VisibleForTesting
|
||||||
String path = pathForCgroup(controller, groupName);
|
boolean deleteCgroup(String cgroupPath) {
|
||||||
|
boolean deleted;
|
||||||
|
|
||||||
LOG.debug("deleteCgroup: " + path);
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("deleteCgroup: " + cgroupPath);
|
||||||
if (! new File(path).delete()) {
|
|
||||||
LOG.warn("Unable to delete cgroup at: " + path);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long start = clock.getTime();
|
||||||
|
do {
|
||||||
|
deleted = new File(cgroupPath).delete();
|
||||||
|
if (!deleted) {
|
||||||
|
try {
|
||||||
|
Thread.sleep(20);
|
||||||
|
} catch (InterruptedException ex) {
|
||||||
|
// NOP
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
|
||||||
|
|
||||||
|
if (!deleted) {
|
||||||
|
LOG.warn("Unable to delete cgroup at: " + cgroupPath +
|
||||||
|
", tried to delete for " + deleteCgroupTimeout + "ms");
|
||||||
|
}
|
||||||
|
|
||||||
|
return deleted;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -185,21 +219,8 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void clearLimits(ContainerId containerId) {
|
private void clearLimits(ContainerId containerId) {
|
||||||
String containerName = containerId.toString();
|
|
||||||
|
|
||||||
// Based on testing, ApplicationMaster executables don't terminate until
|
|
||||||
// a little after the container appears to have finished. Therefore, we
|
|
||||||
// wait a short bit for the cgroup to become empty before deleting it.
|
|
||||||
if (containerId.getId() == 1) {
|
|
||||||
try {
|
|
||||||
Thread.sleep(500);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
// not a problem, continue anyway
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isCpuWeightEnabled()) {
|
if (isCpuWeightEnabled()) {
|
||||||
deleteCgroup(CONTROLLER_CPU, containerName);
|
deleteCgroup(pathForCgroup(CONTROLLER_CPU, containerId.toString()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue