YARN-8259. Improve privileged docker container liveliness checks.
Contributed by Shane Kumpf
This commit is contained in:
parent
69b0596897
commit
22994889dc
|
@ -191,6 +191,7 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime {
|
|||
private static final Pattern USER_MOUNT_PATTERN = Pattern.compile(
|
||||
"(?<=^|,)([^:\\x00]+):([^:\\x00]+):([a-z]+)");
|
||||
private static final int HOST_NAME_LENGTH = 64;
|
||||
private static final String DEFAULT_PROCFS = "/proc";
|
||||
|
||||
@InterfaceAudience.Private
|
||||
public static final String ENV_DOCKER_CONTAINER_IMAGE =
|
||||
|
@ -1192,24 +1193,15 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime {
|
|||
|
||||
private void executeLivelinessCheck(ContainerRuntimeContext ctx)
|
||||
throws ContainerExecutionException {
|
||||
PrivilegedOperation signalOp = new PrivilegedOperation(
|
||||
PrivilegedOperation.OperationType.SIGNAL_CONTAINER);
|
||||
signalOp.appendArgs(ctx.getExecutionAttribute(RUN_AS_USER),
|
||||
ctx.getExecutionAttribute(USER), Integer.toString(
|
||||
PrivilegedOperation.RunAsUserCommand.SIGNAL_CONTAINER.getValue()),
|
||||
ctx.getExecutionAttribute(PID),
|
||||
Integer.toString(ctx.getExecutionAttribute(SIGNAL).getValue()));
|
||||
signalOp.disableFailureLogging();
|
||||
try {
|
||||
privilegedOperationExecutor.executePrivilegedOperation(null, signalOp,
|
||||
null, ctx.getContainer().getLaunchContext().getEnvironment(), false,
|
||||
false);
|
||||
} catch (PrivilegedOperationException e) {
|
||||
String msg = "Liveliness check failed for PID: "
|
||||
+ ctx.getExecutionAttribute(PID)
|
||||
String procFs = ctx.getExecutionAttribute(PROCFS);
|
||||
if (procFs == null || procFs.isEmpty()) {
|
||||
procFs = DEFAULT_PROCFS;
|
||||
}
|
||||
String pid = ctx.getExecutionAttribute(PID);
|
||||
if (!new File(procFs + File.separator + pid).exists()) {
|
||||
String msg = "Liveliness check failed for PID: " + pid
|
||||
+ ". Container may have already completed.";
|
||||
throw new ContainerExecutionException(msg, e.getExitCode(), e.getOutput(),
|
||||
e.getErrorOutput());
|
||||
throw new ContainerExecutionException(msg);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -89,4 +89,6 @@ public final class LinuxContainerRuntimeConstants {
|
|||
String.class, "pid");
|
||||
public static final Attribute<ContainerExecutor.Signal> SIGNAL = Attribute
|
||||
.attribute(ContainerExecutor.Signal.class, "signal");
|
||||
public static final Attribute<String> PROCFS = Attribute.attribute(
|
||||
String.class, "procfs");
|
||||
}
|
|
@ -54,7 +54,9 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.Contai
|
|||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeContext;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
import org.mockito.ArgumentCaptor;
|
||||
import org.mockito.Mockito;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -98,6 +100,7 @@ import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.r
|
|||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntimeConstants.NM_PRIVATE_TOKENS_PATH;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntimeConstants.PID;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntimeConstants.PID_FILE_PATH;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntimeConstants.PROCFS;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntimeConstants.RESOURCES_OPTIONS;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntimeConstants.RUN_AS_USER;
|
||||
import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntimeConstants.SIGNAL;
|
||||
|
@ -153,6 +156,9 @@ public class TestDockerContainerRuntime {
|
|||
private final String signalPid = "1234";
|
||||
private int dockerStopGracePeriod;
|
||||
|
||||
@Rule
|
||||
public TemporaryFolder tempDir = new TemporaryFolder();
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
String tmpPath = new StringBuffer(System.getProperty("test.build.data"))
|
||||
|
@ -1470,9 +1476,24 @@ public class TestDockerContainerRuntime {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testContainerLivelinessCheck()
|
||||
throws ContainerExecutionException, PrivilegedOperationException {
|
||||
public void testContainerLivelinessFileExistsNoException() throws Exception {
|
||||
File testTempDir = tempDir.newFolder();
|
||||
File procPidPath = new File(testTempDir + File.separator + signalPid);
|
||||
procPidPath.createNewFile();
|
||||
procPidPath.deleteOnExit();
|
||||
DockerLinuxContainerRuntime runtime = new DockerLinuxContainerRuntime(
|
||||
mockExecutor, mockCGroupsHandler);
|
||||
builder.setExecutionAttribute(RUN_AS_USER, runAsUser)
|
||||
.setExecutionAttribute(USER, user)
|
||||
.setExecutionAttribute(PID, signalPid)
|
||||
.setExecutionAttribute(SIGNAL, ContainerExecutor.Signal.NULL)
|
||||
.setExecutionAttribute(PROCFS, testTempDir.getAbsolutePath());
|
||||
runtime.initialize(enableMockContainerExecutor(conf), null);
|
||||
runtime.signalContainer(builder.build());
|
||||
}
|
||||
|
||||
@Test(expected = ContainerExecutionException.class)
|
||||
public void testContainerLivelinessNoFileException() throws Exception {
|
||||
DockerLinuxContainerRuntime runtime = new DockerLinuxContainerRuntime(
|
||||
mockExecutor, mockCGroupsHandler);
|
||||
builder.setExecutionAttribute(RUN_AS_USER, runAsUser)
|
||||
|
@ -1481,15 +1502,6 @@ public class TestDockerContainerRuntime {
|
|||
.setExecutionAttribute(SIGNAL, ContainerExecutor.Signal.NULL);
|
||||
runtime.initialize(enableMockContainerExecutor(conf), null);
|
||||
runtime.signalContainer(builder.build());
|
||||
|
||||
PrivilegedOperation op = capturePrivilegedOperation();
|
||||
Assert.assertEquals(op.getOperationType(),
|
||||
PrivilegedOperation.OperationType.SIGNAL_CONTAINER);
|
||||
Assert.assertEquals(runAsUser, op.getArguments().get(0));
|
||||
Assert.assertEquals(submittingUser, op.getArguments().get(1));
|
||||
Assert.assertEquals("2", op.getArguments().get(2));
|
||||
Assert.assertEquals("1234", op.getArguments().get(3));
|
||||
Assert.assertEquals("0", op.getArguments().get(4));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -378,6 +378,21 @@ Trusted images are allowed to mount external devices such as HDFS via NFS gatewa
|
|||
|
||||
For [YARN Service HTTPD example](./yarn-service/Examples.html), container-executor.cfg must define the centos docker registry as trusted for the example to run.
|
||||
|
||||
Container Reacquisition Requirements
|
||||
------------------------------------
|
||||
On restart, the NodeManager, as part of its recovery process, will
|
||||
validate that a container is still running by checking for the existence of the
|
||||
container's PID directory in the /proc filesystem. For security purposes,
|
||||
an operating system administrator may enable the _hidepid_ mount option for the
|
||||
/proc filesystem. If the _hidepid_ option is enabled, the _yarn_ user's primary
|
||||
group must be whitelisted by setting the _gid_ mount option, as shown below.
|
||||
Without the _yarn_ user's primary group whitelisted, container reacquisition
|
||||
will fail and the container will be killed on NodeManager restart.
|
||||
|
||||
```
|
||||
proc /proc proc nosuid,nodev,noexec,hidepid=2,gid=yarn 0 0
|
||||
```
|
||||
|
||||
Connecting to a Secure Docker Repository
|
||||
----------------------------------------
|
||||
|
||||
|
|
Loading…
Reference in New Issue