From 059caf99891943d9587cac19b48e82efbed06b2d Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Mon, 7 Mar 2016 15:40:01 +0000 Subject: [PATCH] YARN-4744. Too many signal to container failure in case of LCE. Contributed by Sidharta Seethana --- .../nodemanager/LinuxContainerExecutor.java | 4 ++-- .../linux/privileged/PrivilegedOperation.java | 23 ++++++++++++++++--- .../PrivilegedOperationExecutor.java | 21 +++++++++-------- .../linux/resources/CGroupsHandlerImpl.java | 2 +- .../linux/resources/TrafficController.java | 2 +- .../runtime/DefaultLinuxContainerRuntime.java | 11 +++++---- .../runtime/DockerLinuxContainerRuntime.java | 5 ++-- .../TestPrivilegedOperationExecutor.java | 4 ++-- .../resources/TestCGroupsHandlerImpl.java | 2 +- 9 files changed, 48 insertions(+), 26 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 1367b13ab8d..5a48e0955c7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -181,7 +181,7 @@ public class LinuxContainerExecutor extends ContainerExecutor { // verify configuration/permissions and exit try { PrivilegedOperation checkSetupOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.CHECK_SETUP, (String) null); + PrivilegedOperation.OperationType.CHECK_SETUP); PrivilegedOperationExecutor privilegedOperationExecutor = PrivilegedOperationExecutor.getInstance(conf); @@ -242,7 +242,7 @@ public class LinuxContainerExecutor extends ContainerExecutor { verifyUsernamePattern(user); String runAsUser = getRunAsUser(user); PrivilegedOperation initializeContainerOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.INITIALIZE_CONTAINER, (String) null); + PrivilegedOperation.OperationType.INITIALIZE_CONTAINER); List prefixCommands = new ArrayList<>(); addSchedPriorityCommand(prefixCommands); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java index cbbf7a80c0b..259dee815a2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java @@ -68,10 +68,16 @@ public class PrivilegedOperation { private final OperationType opType; private final List args; + private boolean failureLogging; - public PrivilegedOperation(OperationType opType, String arg) { + public PrivilegedOperation(OperationType opType) { this.opType = opType; this.args = new ArrayList(); + this.failureLogging = true; + } + + public PrivilegedOperation(OperationType opType, String arg) { + this(opType); if (arg != null) { this.args.add(arg); @@ -79,8 +85,7 @@ public class PrivilegedOperation { } public PrivilegedOperation(OperationType opType, List args) { - this.opType = opType; - this.args = new ArrayList(); + this(opType); if (args != null) { this.args.addAll(args); @@ -97,6 +102,18 @@ public class PrivilegedOperation { this.args.addAll(args); } + public void enableFailureLogging() { + this.failureLogging = true; + } + + public void disableFailureLogging() { + this.failureLogging = false; + } + + public boolean isFailureLoggingEnabled() { + return failureLogging; + } + public OperationType getOperationType() { return opType; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperationExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperationExecutor.java index 4b1bb9f270e..7370daa1d5b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperationExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperationExecutor.java @@ -155,17 +155,20 @@ public class PrivilegedOperationExecutor { LOG.debug(exec.getOutput()); } } catch (ExitCodeException e) { - StringBuilder logBuilder = new StringBuilder("Shell execution returned " - + "exit code: ") - .append(exec.getExitCode()) - .append(". Privileged Execution Operation Output: ") - .append(System.lineSeparator()).append(exec.getOutput()); + if (operation.isFailureLoggingEnabled()) { - logBuilder.append("Full command array for failed execution: ") - .append(System.lineSeparator()); - logBuilder.append(Arrays.toString(fullCommandArray)); + StringBuilder logBuilder = new StringBuilder("Shell execution returned " + + "exit code: ") + .append(exec.getExitCode()) + .append(". Privileged Execution Operation Output: ") + .append(System.lineSeparator()).append(exec.getOutput()); - LOG.warn(logBuilder.toString()); + logBuilder.append("Full command array for failed execution: ") + .append(System.lineSeparator()); + logBuilder.append(Arrays.toString(fullCommandArray)); + + LOG.warn(logBuilder.toString()); + } //stderr from shell executor seems to be stuffed into the exception //'message' - so, we have to extract it and set it as the error out diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java index 1ee0f8ababd..36bd468f791 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsHandlerImpl.java @@ -247,7 +247,7 @@ class CGroupsHandlerImpl implements CGroupsHandler { .append(controller.getName()).append('=').append(controllerPath); PrivilegedOperation.OperationType opType = PrivilegedOperation .OperationType.MOUNT_CGROUPS; - PrivilegedOperation op = new PrivilegedOperation(opType, (String) null); + PrivilegedOperation op = new PrivilegedOperation(opType); op.appendArgs(hierarchy, cGroupKV.toString()); LOG.info("Mounting controller " + controller.getName() + " at " + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TrafficController.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TrafficController.java index e33cea45bf8..f1468fbac67 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TrafficController.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TrafficController.java @@ -546,7 +546,7 @@ import java.util.regex.Pattern; case TC_MODIFY_STATE: case TC_READ_STATE: case TC_READ_STATS: - operation = new PrivilegedOperation(opType, (String) null); + operation = new PrivilegedOperation(opType); commands = new ArrayList<>(); break; default: diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DefaultLinuxContainerRuntime.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DefaultLinuxContainerRuntime.java index 633fa668ae4..3862b92063e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DefaultLinuxContainerRuntime.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DefaultLinuxContainerRuntime.java @@ -67,7 +67,7 @@ public class DefaultLinuxContainerRuntime implements LinuxContainerRuntime { throws ContainerExecutionException { Container container = ctx.getContainer(); PrivilegedOperation launchOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.LAUNCH_CONTAINER, (String) null); + PrivilegedOperation.OperationType.LAUNCH_CONTAINER); //All of these arguments are expected to be available in the runtime context launchOp.appendArgs(ctx.getExecutionAttribute(RUN_AS_USER), @@ -116,7 +116,7 @@ public class DefaultLinuxContainerRuntime implements LinuxContainerRuntime { throws ContainerExecutionException { Container container = ctx.getContainer(); PrivilegedOperation signalOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.SIGNAL_CONTAINER, (String) null); + PrivilegedOperation.OperationType.SIGNAL_CONTAINER); signalOp.appendArgs(ctx.getExecutionAttribute(RUN_AS_USER), ctx.getExecutionAttribute(USER), @@ -125,6 +125,9 @@ public class DefaultLinuxContainerRuntime implements LinuxContainerRuntime { ctx.getExecutionAttribute(PID), Integer.toString(ctx.getExecutionAttribute(SIGNAL).getValue())); + //Some failures here are acceptable. Let the calling executor decide. + signalOp.disableFailureLogging(); + try { PrivilegedOperationExecutor executor = PrivilegedOperationExecutor .getInstance(conf); @@ -133,8 +136,8 @@ public class DefaultLinuxContainerRuntime implements LinuxContainerRuntime { signalOp, null, container.getLaunchContext().getEnvironment(), false); } catch (PrivilegedOperationException e) { - LOG.warn("Signal container failed. Exception: ", e); - + //Don't log the failure here. Some kinds of signaling failures are + // acceptable. Let the calling executor decide what to do. throw new ContainerExecutionException("Signal container failed", e .getExitCode(), e.getOutput(), e.getErrorOutput()); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java index 2dee663b33c..2b4fc791286 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/runtime/DockerLinuxContainerRuntime.java @@ -280,8 +280,7 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime { String commandFile = dockerClient.writeCommandToTempFile(runCommand, containerIdStr); PrivilegedOperation launchOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.LAUNCH_DOCKER_CONTAINER, (String) - null); + PrivilegedOperation.OperationType.LAUNCH_DOCKER_CONTAINER); launchOp.appendArgs(runAsUser, ctx.getExecutionAttribute(USER), Integer.toString(PrivilegedOperation @@ -321,7 +320,7 @@ public class DockerLinuxContainerRuntime implements LinuxContainerRuntime { throws ContainerExecutionException { Container container = ctx.getContainer(); PrivilegedOperation signalOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.SIGNAL_CONTAINER, (String) null); + PrivilegedOperation.OperationType.SIGNAL_CONTAINER); signalOp.appendArgs(ctx.getExecutionAttribute(RUN_AS_USER), ctx.getExecutionAttribute(USER), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/TestPrivilegedOperationExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/TestPrivilegedOperationExecutor.java index 849dbabf20a..7146412f627 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/TestPrivilegedOperationExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/TestPrivilegedOperationExecutor.java @@ -69,7 +69,7 @@ public class TestPrivilegedOperationExecutor { cGroupTasks2 = "net_cls/hadoop_yarn/container_01/tasks"; cGroupTasks3 = "blkio/hadoop_yarn/container_01/tasks"; opDisallowed = new PrivilegedOperation - (PrivilegedOperation.OperationType.DELETE_AS_USER, (String) null); + (PrivilegedOperation.OperationType.DELETE_AS_USER); opTasksNone = new PrivilegedOperation (PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, PrivilegedOperation.CGROUP_ARG_PREFIX + cGroupTasksNone); @@ -118,7 +118,7 @@ public class TestPrivilegedOperationExecutor { PrivilegedOperationExecutor exec = PrivilegedOperationExecutor .getInstance(confWithExecutorPath); PrivilegedOperation op = new PrivilegedOperation(PrivilegedOperation - .OperationType.TC_MODIFY_STATE, (String) null); + .OperationType.TC_MODIFY_STATE); String[] cmdArray = exec.getPrivilegedOperationExecutionCommand(null, op); //No arguments added - so the resulting array should consist of diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java index 50f8da6de95..76d56b4a725 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsHandlerImpl.java @@ -89,7 +89,7 @@ public class TestCGroupsHandlerImpl { cGroupsHandler = new CGroupsHandlerImpl(conf, privilegedOperationExecutorMock); PrivilegedOperation expectedOp = new PrivilegedOperation( - PrivilegedOperation.OperationType.MOUNT_CGROUPS, (String) null); + PrivilegedOperation.OperationType.MOUNT_CGROUPS); //This is expected to be of the form : //net_cls=/net_cls StringBuffer controllerKV = new StringBuffer(controller.getName())