YARN-8706. Updated docker container stop logic to avoid double kill.
Contributed by Chandni Singh
This commit is contained in:
parent
7d62334387
commit
bf8a1750e9
|
@ -1995,7 +1995,10 @@ public class YarnConfiguration extends Configuration {
|
|||
* A configurable value to pass to the Docker Stop command. This value
|
||||
* defines the number of seconds between the docker stop command sending
|
||||
* a SIGTERM and a SIGKILL.
|
||||
*
|
||||
* @deprecated use {@link YarnConfiguration#NM_SLEEP_DELAY_BEFORE_SIGKILL_MS}
|
||||
*/
|
||||
@Deprecated
|
||||
public static final String NM_DOCKER_STOP_GRACE_PERIOD =
|
||||
DOCKER_CONTAINER_RUNTIME_PREFIX + "stop.grace-period";
|
||||
|
||||
|
@ -2003,6 +2006,7 @@ public class YarnConfiguration extends Configuration {
|
|||
* The default value for the grace period between the SIGTERM and the
|
||||
* SIGKILL in the Docker Stop command.
|
||||
*/
|
||||
@Deprecated
|
||||
public static final int DEFAULT_NM_DOCKER_STOP_GRACE_PERIOD = 10;
|
||||
|
||||
/** The default list of read-only mounts to be bind-mounted into all
|
||||
|
|
|
@ -58,7 +58,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerClient;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerInspectCommand;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerRunCommand;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker.DockerStopCommand;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntime;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
|
||||
|
@ -245,7 +244,6 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime {
|
|||
private int userRemappingGidThreshold;
|
||||
private Set<String> capabilities;
|
||||
private boolean delayedRemovalAllowed;
|
||||
private int dockerStopGracePeriod;
|
||||
private Set<String> defaultROMounts = new HashSet<>();
|
||||
private Set<String> defaultRWMounts = new HashSet<>();
|
||||
private Set<String> defaultTmpfsMounts = new HashSet<>();
|
||||
|
@ -356,10 +354,6 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime {
|
|||
YarnConfiguration.NM_DOCKER_ALLOW_DELAYED_REMOVAL,
|
||||
YarnConfiguration.DEFAULT_NM_DOCKER_ALLOW_DELAYED_REMOVAL);
|
||||
|
||||
dockerStopGracePeriod = conf.getInt(
|
||||
YarnConfiguration.NM_DOCKER_STOP_GRACE_PERIOD,
|
||||
YarnConfiguration.DEFAULT_NM_DOCKER_STOP_GRACE_PERIOD);
|
||||
|
||||
defaultROMounts.addAll(Arrays.asList(
|
||||
conf.getTrimmedStrings(
|
||||
YarnConfiguration.NM_DOCKER_DEFAULT_RO_MOUNTS)));
|
||||
|
@ -1084,7 +1078,7 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime {
|
|||
if (ContainerExecutor.Signal.NULL.equals(signal)) {
|
||||
executeLivelinessCheck(ctx);
|
||||
} else if (ContainerExecutor.Signal.TERM.equals(signal)) {
|
||||
String containerId = ctx.getContainer().getContainerId().toString();
|
||||
ContainerId containerId = ctx.getContainer().getContainerId();
|
||||
handleContainerStop(containerId, env);
|
||||
} else {
|
||||
handleContainerKill(ctx, env, signal);
|
||||
|
@ -1137,14 +1131,7 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime {
|
|||
DockerInspectCommand inspectCommand =
|
||||
new DockerInspectCommand(containerIdStr).getIpAndHost();
|
||||
try {
|
||||
String commandFile = dockerClient.writeCommandToTempFile(inspectCommand,
|
||||
containerId, nmContext);
|
||||
PrivilegedOperation privOp = new PrivilegedOperation(
|
||||
PrivilegedOperation.OperationType.RUN_DOCKER_CMD);
|
||||
privOp.appendArgs(commandFile);
|
||||
String output = privilegedOperationExecutor
|
||||
.executePrivilegedOperation(null, privOp, null,
|
||||
null, true, false);
|
||||
String output = executeDockerInspect(containerId, inspectCommand);
|
||||
LOG.info("Docker inspect output for " + containerId + ": " + output);
|
||||
// strip off quotes if any
|
||||
output = output.replaceAll("['\"]", "");
|
||||
|
@ -1266,23 +1253,74 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime {
|
|||
}
|
||||
}
|
||||
|
||||
private void handleContainerStop(String containerId, Map<String, String> env)
|
||||
/**
|
||||
* Handles a docker container stop by first finding the {@code STOPSIGNAL}
|
||||
* using docker inspect and then executing
|
||||
* {@code docker kill --signal=<STOPSIGNAL>}.
|
||||
* It doesn't rely on the docker stop because that sends a {@code SIGKILL}
|
||||
* to the root process in the container after the {@code STOPSIGNAL}.The grace
|
||||
* period which the docker stop uses has granularity in seconds. However, NM
|
||||
* is designed to explicitly send a {@code SIGKILL} to the containers after a
|
||||
* grace period which has a granularity of millis. It doesn't want the docker
|
||||
* stop to send {@code SIGKILL} but docker stop has no option to disallow
|
||||
* that.
|
||||
*
|
||||
* @param containerId container id
|
||||
* @param env env
|
||||
* @throws ContainerExecutionException
|
||||
*/
|
||||
private void handleContainerStop(ContainerId containerId,
|
||||
Map<String, String> env)
|
||||
throws ContainerExecutionException {
|
||||
|
||||
DockerCommandExecutor.DockerContainerStatus containerStatus =
|
||||
DockerCommandExecutor.getContainerStatus(containerId,
|
||||
privilegedOperationExecutor, nmContext);
|
||||
if (DockerCommandExecutor.isStoppable(containerStatus)) {
|
||||
DockerStopCommand dockerStopCommand = new DockerStopCommand(
|
||||
containerId).setGracePeriod(dockerStopGracePeriod);
|
||||
DockerCommandExecutor.executeDockerCommand(dockerStopCommand, containerId,
|
||||
env, privilegedOperationExecutor, false, nmContext);
|
||||
} else {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug(
|
||||
"Container status is " + containerStatus.getName()
|
||||
+ ", skipping stop - " + containerId);
|
||||
DockerCommandExecutor.DockerContainerStatus.UNKNOWN;
|
||||
String stopSignal = ContainerExecutor.Signal.TERM.toString();
|
||||
char delimiter = ',';
|
||||
DockerInspectCommand inspectCommand =
|
||||
new DockerInspectCommand(containerId.toString()).get(new String[] {
|
||||
DockerInspectCommand.STATUS_TEMPLATE,
|
||||
DockerInspectCommand.STOPSIGNAL_TEMPLATE}, delimiter);
|
||||
try {
|
||||
String output = executeDockerInspect(containerId, inspectCommand);
|
||||
|
||||
if (!output.isEmpty()) {
|
||||
String[] statusAndSignal = StringUtils.split(output, delimiter);
|
||||
containerStatus = DockerCommandExecutor.parseContainerStatus(
|
||||
statusAndSignal[0]);
|
||||
if (statusAndSignal.length > 1) {
|
||||
stopSignal = statusAndSignal[1];
|
||||
}
|
||||
}
|
||||
} catch (ContainerExecutionException | PrivilegedOperationException e) {
|
||||
LOG.debug("{} inspect failed, skipping stop", containerId, e);
|
||||
return;
|
||||
}
|
||||
|
||||
if (DockerCommandExecutor.isStoppable(containerStatus)) {
|
||||
|
||||
DockerKillCommand dockerStopCommand = new DockerKillCommand(
|
||||
containerId.toString()).setSignal(stopSignal);
|
||||
DockerCommandExecutor.executeDockerCommand(dockerStopCommand,
|
||||
containerId.toString(), env, privilegedOperationExecutor, false,
|
||||
nmContext);
|
||||
} else {
|
||||
LOG.debug("{} status is {}, skipping stop", containerId, containerStatus);
|
||||
}
|
||||
}
|
||||
|
||||
private String executeDockerInspect(ContainerId containerId,
|
||||
DockerInspectCommand inspectCommand) throws ContainerExecutionException,
|
||||
PrivilegedOperationException {
|
||||
String commandFile = dockerClient.writeCommandToTempFile(inspectCommand,
|
||||
containerId, nmContext);
|
||||
PrivilegedOperation privOp = new PrivilegedOperation(
|
||||
PrivilegedOperation.OperationType.RUN_DOCKER_CMD);
|
||||
privOp.appendArgs(commandFile);
|
||||
String output = privilegedOperationExecutor.executePrivilegedOperation(null,
|
||||
privOp, null, null, true, false);
|
||||
LOG.info("{} : docker inspect output {} ", containerId, output);
|
||||
return output;
|
||||
}
|
||||
|
||||
private void handleContainerKill(ContainerRuntimeContext ctx,
|
||||
|
|
|
@ -113,39 +113,11 @@ public final class DockerCommandExecutor {
|
|||
PrivilegedOperationExecutor privilegedOperationExecutor,
|
||||
Context nmContext) {
|
||||
try {
|
||||
DockerContainerStatus dockerContainerStatus;
|
||||
String currentContainerStatus =
|
||||
executeStatusCommand(containerId,
|
||||
privilegedOperationExecutor, nmContext);
|
||||
if (currentContainerStatus == null) {
|
||||
dockerContainerStatus = DockerContainerStatus.UNKNOWN;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.CREATED.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.CREATED;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.RUNNING.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.RUNNING;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.STOPPED.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.STOPPED;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.RESTARTING.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.RESTARTING;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.REMOVING.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.REMOVING;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.DEAD.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.DEAD;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.EXITED.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.EXITED;
|
||||
} else if (currentContainerStatus
|
||||
.equals(DockerContainerStatus.NONEXISTENT.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.NONEXISTENT;
|
||||
} else {
|
||||
dockerContainerStatus = DockerContainerStatus.UNKNOWN;
|
||||
}
|
||||
DockerContainerStatus dockerContainerStatus = parseContainerStatus(
|
||||
currentContainerStatus);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Container Status: " + dockerContainerStatus.getName()
|
||||
+ " ContainerId: " + containerId);
|
||||
|
@ -161,6 +133,47 @@ public final class DockerCommandExecutor {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the container status string.
|
||||
*
|
||||
* @param containerStatusStr container status.
|
||||
* @return a {@link DockerContainerStatus} representing the status.
|
||||
*/
|
||||
public static DockerContainerStatus parseContainerStatus(
|
||||
String containerStatusStr) {
|
||||
DockerContainerStatus dockerContainerStatus;
|
||||
if (containerStatusStr == null) {
|
||||
dockerContainerStatus = DockerContainerStatus.UNKNOWN;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.CREATED.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.CREATED;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.RUNNING.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.RUNNING;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.STOPPED.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.STOPPED;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.RESTARTING.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.RESTARTING;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.REMOVING.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.REMOVING;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.DEAD.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.DEAD;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.EXITED.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.EXITED;
|
||||
} else if (containerStatusStr
|
||||
.equals(DockerContainerStatus.NONEXISTENT.getName())) {
|
||||
dockerContainerStatus = DockerContainerStatus.NONEXISTENT;
|
||||
} else {
|
||||
dockerContainerStatus = DockerContainerStatus.UNKNOWN;
|
||||
}
|
||||
return dockerContainerStatus;
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute the docker inspect command to retrieve the docker container's
|
||||
* status.
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.docker;
|
||||
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
|
||||
|
||||
|
@ -39,8 +40,8 @@ public class DockerInspectCommand extends DockerCommand {
|
|||
}
|
||||
|
||||
public DockerInspectCommand getContainerStatus() {
|
||||
super.addCommandArguments("format", "{{.State.Status}}");
|
||||
this.commandArguments = "--format={{.State.Status}}";
|
||||
super.addCommandArguments("format", STATUS_TEMPLATE);
|
||||
this.commandArguments = String.format("--format=%s", STATUS_TEMPLATE);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -54,6 +55,14 @@ public class DockerInspectCommand extends DockerCommand {
|
|||
+ "{{.IPAddress}},{{end}}{{.Config.Hostname}}";
|
||||
return this;
|
||||
}
|
||||
|
||||
public DockerInspectCommand get(String[] templates, char delimiter) {
|
||||
String format = StringUtils.join(delimiter, templates);
|
||||
super.addCommandArguments("format", format);
|
||||
this.commandArguments = String.format("--format=%s", format);
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public PrivilegedOperation preparePrivilegedOperation(
|
||||
DockerCommand dockerCommand, String containerName, Map<String,
|
||||
|
@ -63,4 +72,7 @@ public class DockerInspectCommand extends DockerCommand {
|
|||
dockerOp.appendArgs(commandArguments, containerName);
|
||||
return dockerOp;
|
||||
}
|
||||
|
||||
public static final String STATUS_TEMPLATE = "{{.State.Status}}";
|
||||
public static final String STOPSIGNAL_TEMPLATE = "{{.Config.StopSignal}}";
|
||||
}
|
||||
|
|
|
@ -155,7 +155,6 @@ public class TestDockerContainerRuntime {
|
|||
private final String whitelistedUser = "yoda";
|
||||
private String[] testCapabilities;
|
||||
private final String signalPid = "1234";
|
||||
private int dockerStopGracePeriod;
|
||||
|
||||
@Rule
|
||||
public TemporaryFolder tempDir = new TemporaryFolder();
|
||||
|
@ -181,10 +180,6 @@ public class TestDockerContainerRuntime {
|
|||
image = "busybox:latest";
|
||||
nmContext = createMockNMContext();
|
||||
|
||||
dockerStopGracePeriod = conf.getInt(
|
||||
YarnConfiguration.NM_DOCKER_STOP_GRACE_PERIOD,
|
||||
YarnConfiguration.DEFAULT_NM_DOCKER_STOP_GRACE_PERIOD);
|
||||
|
||||
env.put(DockerLinuxContainerRuntime.ENV_DOCKER_CONTAINER_IMAGE, image);
|
||||
when(container.getContainerId()).thenReturn(cId);
|
||||
when(cId.toString()).thenReturn(containerId);
|
||||
|
@ -1655,13 +1650,23 @@ public class TestDockerContainerRuntime {
|
|||
DockerCommandExecutor.DockerContainerStatus.RUNNING.getName());
|
||||
List<String> dockerCommands = getDockerCommandsForDockerStop(
|
||||
ContainerExecutor.Signal.TERM);
|
||||
Assert.assertEquals(4, dockerCommands.size());
|
||||
Assert.assertEquals("[docker-command-execution]", dockerCommands.get(0));
|
||||
Assert.assertEquals(" docker-command=stop", dockerCommands.get(1));
|
||||
Assert.assertEquals(
|
||||
" name=container_e11_1518975676334_14532816_01_000001",
|
||||
dockerCommands.get(2));
|
||||
Assert.assertEquals(" time=10", dockerCommands.get(3));
|
||||
verifyStopCommand(dockerCommands, ContainerExecutor.Signal.TERM.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
@SuppressWarnings("unchecked")
|
||||
public void testDockerStopWithQuitSignalWhenRunning()
|
||||
throws ContainerExecutionException, PrivilegedOperationException,
|
||||
IOException {
|
||||
when(mockExecutor
|
||||
.executePrivilegedOperation(anyList(), any(PrivilegedOperation.class),
|
||||
any(File.class), anyMap(), anyBoolean(), anyBoolean())).thenReturn(
|
||||
DockerCommandExecutor.DockerContainerStatus.RUNNING.getName() +
|
||||
",SIGQUIT");
|
||||
|
||||
List<String> dockerCommands = getDockerCommandsForDockerStop(
|
||||
ContainerExecutor.Signal.TERM);
|
||||
verifyStopCommand(dockerCommands, "SIGQUIT");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -1714,13 +1719,7 @@ public class TestDockerContainerRuntime {
|
|||
DockerCommandExecutor.DockerContainerStatus.RUNNING.getName());
|
||||
List<String> dockerCommands = getDockerCommandsForDockerStop(
|
||||
ContainerExecutor.Signal.TERM);
|
||||
Assert.assertEquals(4, dockerCommands.size());
|
||||
Assert.assertEquals("[docker-command-execution]", dockerCommands.get(0));
|
||||
Assert.assertEquals(" docker-command=stop", dockerCommands.get(1));
|
||||
Assert.assertEquals(
|
||||
" name=container_e11_1518975676334_14532816_01_000001",
|
||||
dockerCommands.get(2));
|
||||
Assert.assertEquals(" time=10", dockerCommands.get(3));
|
||||
verifyStopCommand(dockerCommands, ContainerExecutor.Signal.TERM.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -2351,4 +2350,14 @@ public class TestDockerContainerRuntime {
|
|||
" name=container_e11_1518975676334_14532816_01_000001",
|
||||
dockerCommands.get(counter));
|
||||
}
|
||||
|
||||
private static void verifyStopCommand(List<String> dockerCommands,
|
||||
String signal) {
|
||||
Assert.assertEquals(4, dockerCommands.size());
|
||||
Assert.assertEquals("[docker-command-execution]", dockerCommands.get(0));
|
||||
Assert.assertEquals(" docker-command=kill", dockerCommands.get(1));
|
||||
Assert.assertEquals(" name=container_e11_1518975676334_14532816_01_000001",
|
||||
dockerCommands.get(2));
|
||||
Assert.assertEquals(" signal=" + signal, dockerCommands.get(3));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue