YARN-4744. Too many signal-to-container failures in case of LCE. Contributed by Sidharta Seethana

(cherry picked from commit 059caf9989)
This commit is contained in:
Jason Lowe 2016-03-07 15:40:01 +00:00
parent ed421cb128
commit 4eace7ab43
9 changed files with 48 additions and 26 deletions

View File

@ -181,7 +181,7 @@ public void init() throws IOException {
// verify configuration/permissions and exit
try {
PrivilegedOperation checkSetupOp = new PrivilegedOperation(
PrivilegedOperation.OperationType.CHECK_SETUP, (String) null);
PrivilegedOperation.OperationType.CHECK_SETUP);
PrivilegedOperationExecutor privilegedOperationExecutor =
PrivilegedOperationExecutor.getInstance(conf);
@ -242,7 +242,7 @@ public void startLocalizer(LocalizerStartContext ctx)
verifyUsernamePattern(user);
String runAsUser = getRunAsUser(user);
PrivilegedOperation initializeContainerOp = new PrivilegedOperation(
PrivilegedOperation.OperationType.INITIALIZE_CONTAINER, (String) null);
PrivilegedOperation.OperationType.INITIALIZE_CONTAINER);
List<String> prefixCommands = new ArrayList<>();
addSchedPriorityCommand(prefixCommands);

View File

@ -68,10 +68,16 @@ public String getOption() {
private final OperationType opType;
private final List<String> args;
private boolean failureLogging;
public PrivilegedOperation(OperationType opType, String arg) {
/**
 * Creates an operation of the given type with an empty argument list.
 * Failure logging starts out enabled; callers that expect some failures
 * can turn it off with {@code disableFailureLogging()}.
 *
 * @param opType the type of privileged operation this instance represents
 */
public PrivilegedOperation(OperationType opType) {
this.opType = opType;
// Start with an empty, mutable list so arguments can be appended later.
this.args = new ArrayList<String>();
// Log failures by default.
this.failureLogging = true;
}
public PrivilegedOperation(OperationType opType, String arg) {
this(opType);
if (arg != null) {
this.args.add(arg);
@ -79,8 +85,7 @@ public PrivilegedOperation(OperationType opType, String arg) {
}
public PrivilegedOperation(OperationType opType, List<String> args) {
this.opType = opType;
this.args = new ArrayList<String>();
this(opType);
if (args != null) {
this.args.addAll(args);
@ -97,6 +102,18 @@ public void appendArgs(List<String> args) {
this.args.addAll(args);
}
/**
 * Sets the failure-logging flag of this operation to {@code true}.
 * The flag is reported by {@code isFailureLoggingEnabled()}.
 */
public void enableFailureLogging() {
this.failureLogging = true;
}
/**
 * Sets the failure-logging flag of this operation to {@code false}.
 * Intended for operations where some failures are acceptable and the
 * caller decides whether a failure is worth reporting.
 */
public void disableFailureLogging() {
this.failureLogging = false;
}
/**
 * Reports whether failure logging is currently enabled for this operation.
 *
 * @return {@code true} if failure logging is enabled, {@code false} otherwise
 */
public boolean isFailureLoggingEnabled() {
return failureLogging;
}
/**
 * Returns the operation type this instance was constructed with.
 *
 * @return the operation type
 */
public OperationType getOperationType() {
return opType;
}

View File

@ -155,6 +155,8 @@ public String executePrivilegedOperation(List<String> prefixCommands,
LOG.debug(exec.getOutput());
}
} catch (ExitCodeException e) {
if (operation.isFailureLoggingEnabled()) {
StringBuilder logBuilder = new StringBuilder("Shell execution returned "
+ "exit code: ")
.append(exec.getExitCode())
@ -166,6 +168,7 @@ public String executePrivilegedOperation(List<String> prefixCommands,
logBuilder.append(Arrays.toString(fullCommandArray));
LOG.warn(logBuilder.toString());
}
//stderr from shell executor seems to be stuffed into the exception
//'message' - so, we have to extract it and set it as the error out

View File

@ -247,7 +247,7 @@ public void mountCGroupController(CGroupController controller)
.append(controller.getName()).append('=').append(controllerPath);
PrivilegedOperation.OperationType opType = PrivilegedOperation
.OperationType.MOUNT_CGROUPS;
PrivilegedOperation op = new PrivilegedOperation(opType, (String) null);
PrivilegedOperation op = new PrivilegedOperation(opType);
op.appendArgs(hierarchy, cGroupKV.toString());
LOG.info("Mounting controller " + controller.getName() + " at " +

View File

@ -546,7 +546,7 @@ public BatchBuilder(PrivilegedOperation.OperationType opType)
case TC_MODIFY_STATE:
case TC_READ_STATE:
case TC_READ_STATS:
operation = new PrivilegedOperation(opType, (String) null);
operation = new PrivilegedOperation(opType);
commands = new ArrayList<>();
break;
default:

View File

@ -67,7 +67,7 @@ public void launchContainer(ContainerRuntimeContext ctx)
throws ContainerExecutionException {
Container container = ctx.getContainer();
PrivilegedOperation launchOp = new PrivilegedOperation(
PrivilegedOperation.OperationType.LAUNCH_CONTAINER, (String) null);
PrivilegedOperation.OperationType.LAUNCH_CONTAINER);
//All of these arguments are expected to be available in the runtime context
launchOp.appendArgs(ctx.getExecutionAttribute(RUN_AS_USER),
@ -116,7 +116,7 @@ public void signalContainer(ContainerRuntimeContext ctx)
throws ContainerExecutionException {
Container container = ctx.getContainer();
PrivilegedOperation signalOp = new PrivilegedOperation(
PrivilegedOperation.OperationType.SIGNAL_CONTAINER, (String) null);
PrivilegedOperation.OperationType.SIGNAL_CONTAINER);
signalOp.appendArgs(ctx.getExecutionAttribute(RUN_AS_USER),
ctx.getExecutionAttribute(USER),
@ -125,6 +125,9 @@ public void signalContainer(ContainerRuntimeContext ctx)
ctx.getExecutionAttribute(PID),
Integer.toString(ctx.getExecutionAttribute(SIGNAL).getValue()));
//Some failures here are acceptable. Let the calling executor decide.
signalOp.disableFailureLogging();
try {
PrivilegedOperationExecutor executor = PrivilegedOperationExecutor
.getInstance(conf);
@ -133,8 +136,8 @@ public void signalContainer(ContainerRuntimeContext ctx)
signalOp, null, container.getLaunchContext().getEnvironment(),
false);
} catch (PrivilegedOperationException e) {
LOG.warn("Signal container failed. Exception: ", e);
//Don't log the failure here. Some kinds of signaling failures are
// acceptable. Let the calling executor decide what to do.
throw new ContainerExecutionException("Signal container failed", e
.getExitCode(), e.getOutput(), e.getErrorOutput());
}

View File

@ -280,8 +280,7 @@ public void launchContainer(ContainerRuntimeContext ctx)
String commandFile = dockerClient.writeCommandToTempFile(runCommand,
containerIdStr);
PrivilegedOperation launchOp = new PrivilegedOperation(
PrivilegedOperation.OperationType.LAUNCH_DOCKER_CONTAINER, (String)
null);
PrivilegedOperation.OperationType.LAUNCH_DOCKER_CONTAINER);
launchOp.appendArgs(runAsUser, ctx.getExecutionAttribute(USER),
Integer.toString(PrivilegedOperation
@ -321,7 +320,7 @@ public void signalContainer(ContainerRuntimeContext ctx)
throws ContainerExecutionException {
Container container = ctx.getContainer();
PrivilegedOperation signalOp = new PrivilegedOperation(
PrivilegedOperation.OperationType.SIGNAL_CONTAINER, (String) null);
PrivilegedOperation.OperationType.SIGNAL_CONTAINER);
signalOp.appendArgs(ctx.getExecutionAttribute(RUN_AS_USER),
ctx.getExecutionAttribute(USER),

View File

@ -69,7 +69,7 @@ public void setup() {
cGroupTasks2 = "net_cls/hadoop_yarn/container_01/tasks";
cGroupTasks3 = "blkio/hadoop_yarn/container_01/tasks";
opDisallowed = new PrivilegedOperation
(PrivilegedOperation.OperationType.DELETE_AS_USER, (String) null);
(PrivilegedOperation.OperationType.DELETE_AS_USER);
opTasksNone = new PrivilegedOperation
(PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP,
PrivilegedOperation.CGROUP_ARG_PREFIX + cGroupTasksNone);
@ -118,7 +118,7 @@ public void testExecutionCommand() {
PrivilegedOperationExecutor exec = PrivilegedOperationExecutor
.getInstance(confWithExecutorPath);
PrivilegedOperation op = new PrivilegedOperation(PrivilegedOperation
.OperationType.TC_MODIFY_STATE, (String) null);
.OperationType.TC_MODIFY_STATE);
String[] cmdArray = exec.getPrivilegedOperationExecutionCommand(null, op);
//No arguments added - so the resulting array should consist of

View File

@ -89,7 +89,7 @@ public void testMountController() {
cGroupsHandler = new CGroupsHandlerImpl(conf,
privilegedOperationExecutorMock);
PrivilegedOperation expectedOp = new PrivilegedOperation(
PrivilegedOperation.OperationType.MOUNT_CGROUPS, (String) null);
PrivilegedOperation.OperationType.MOUNT_CGROUPS);
//This is expected to be of the form :
//net_cls=<mount_path>/net_cls
StringBuffer controllerKV = new StringBuffer(controller.getName())