YARN-2846. Incorrect exit code persisted for running containers when reacquireContainer() is interrupted by NodeManager restart. Contributed by Junping Du.

This commit is contained in:
Jason Lowe 2014-11-13 16:11:04 +00:00
parent 177e8090f5
commit 33ea5ae92b
4 changed files with 25 additions and 22 deletions

View File

@ -955,6 +955,10 @@ Release 2.6.0 - 2014-11-15
YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
zjshen)
YARN-2846. Incorrect exit code persisted for running containers when
reacquireContainer() is interrupted by NodeManager restart. (Junping Du
via jlowe)
Release 2.5.2 - 2014-11-10
INCOMPATIBLE CHANGES

View File

@ -159,9 +159,10 @@ public abstract boolean isContainerProcessAlive(String user, String pid)
* @param containerId The ID of the container to reacquire * @param containerId The ID of the container to reacquire
* @return The exit code of the pre-existing container * @return The exit code of the pre-existing container
* @throws IOException * @throws IOException
* @throws InterruptedException
*/ */
public int reacquireContainer(String user, ContainerId containerId) public int reacquireContainer(String user, ContainerId containerId)
throws IOException { throws IOException, InterruptedException {
Path pidPath = getPidFilePath(containerId); Path pidPath = getPidFilePath(containerId);
if (pidPath == null) { if (pidPath == null) {
LOG.warn(containerId + " is not active, returning terminated error"); LOG.warn(containerId + " is not active, returning terminated error");
@ -175,14 +176,9 @@ public int reacquireContainer(String user, ContainerId containerId)
} }
LOG.info("Reacquiring " + containerId + " with pid " + pid); LOG.info("Reacquiring " + containerId + " with pid " + pid);
try {
while(isContainerProcessAlive(user, pid)) { while(isContainerProcessAlive(user, pid)) {
Thread.sleep(1000); Thread.sleep(1000);
} }
} catch (InterruptedException e) {
throw new IOException("Interrupted while waiting for process " + pid
+ " to exit", e);
}
// wait for exit code file to appear // wait for exit code file to appear
String exitCodeFile = ContainerLaunch.getExitCodeFile(pidPath.toString()); String exitCodeFile = ContainerLaunch.getExitCodeFile(pidPath.toString());
@ -194,12 +190,9 @@ public int reacquireContainer(String user, ContainerId containerId)
LOG.info(containerId + " was deactivated"); LOG.info(containerId + " was deactivated");
return ExitCode.TERMINATED.getExitCode(); return ExitCode.TERMINATED.getExitCode();
} }
try {
Thread.sleep(sleepMsec); Thread.sleep(sleepMsec);
} catch (InterruptedException e) {
throw new IOException(
"Interrupted while waiting for exit code from " + containerId, e);
}
msecLeft -= sleepMsec; msecLeft -= sleepMsec;
} }
if (msecLeft < 0) { if (msecLeft < 0) {

View File

@ -347,7 +347,7 @@ public int launchContainer(Container container,
@Override @Override
public int reacquireContainer(String user, ContainerId containerId) public int reacquireContainer(String user, ContainerId containerId)
throws IOException { throws IOException, InterruptedException {
try { try {
return super.reacquireContainer(user, containerId); return super.reacquireContainer(user, containerId);
} finally { } finally {

View File

@ -73,6 +73,7 @@ public Integer call() {
dispatcher.getEventHandler().handle(new ContainerEvent(containerId, dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
ContainerEventType.CONTAINER_LAUNCHED)); ContainerEventType.CONTAINER_LAUNCHED));
boolean notInterrupted = true;
try { try {
File pidFile = locatePidFile(appIdStr, containerIdStr); File pidFile = locatePidFile(appIdStr, containerIdStr);
if (pidFile != null) { if (pidFile != null) {
@ -85,7 +86,11 @@ public Integer call() {
} }
} catch (IOException e) { } catch (IOException e) {
LOG.error("Unable to recover container " + containerIdStr, e); LOG.error("Unable to recover container " + containerIdStr, e);
} catch (InterruptedException e) {
LOG.warn("Interrupted while waiting for exit code from " + containerId);
notInterrupted = false;
} finally { } finally {
if (notInterrupted) {
this.completed.set(true); this.completed.set(true);
exec.deactivateContainer(containerId); exec.deactivateContainer(containerId);
try { try {
@ -95,6 +100,7 @@ public Integer call() {
LOG.error("Unable to set exit code for container " + containerId); LOG.error("Unable to set exit code for container " + containerId);
} }
} }
}
if (retCode != 0) { if (retCode != 0) {
LOG.warn("Recovered container exited with a non-zero exit code " LOG.warn("Recovered container exited with a non-zero exit code "