YARN-7189. Container-executor doesn't remove Docker containers that error out early. Contributed by Eric Badger
(cherry picked from commit 391ac5cdd2)
This commit is contained in:
parent
88cb461c87
commit
5ec195edbc
|
@ -1444,7 +1444,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
if (exit_code != 0) {
|
if (exit_code != 0) {
|
||||||
fprintf(ERRORFILE, "Could not create script path\n");
|
fprintf(ERRORFILE, "Could not create script path\n");
|
||||||
fflush(ERRORFILE);
|
fflush(ERRORFILE);
|
||||||
goto cleanup;
|
goto pre_launch_cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(LOGFILE, "Creating local dirs...\n");
|
fprintf(LOGFILE, "Creating local dirs...\n");
|
||||||
|
@ -1455,7 +1455,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
if (exit_code != 0) {
|
if (exit_code != 0) {
|
||||||
fprintf(ERRORFILE, "Could not create local files and directories %d %d\n", container_file_source, cred_file_source);
|
fprintf(ERRORFILE, "Could not create local files and directories %d %d\n", container_file_source, cred_file_source);
|
||||||
fflush(ERRORFILE);
|
fflush(ERRORFILE);
|
||||||
goto cleanup;
|
goto pre_launch_cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
docker_command = construct_docker_command(command_file);
|
docker_command = construct_docker_command(command_file);
|
||||||
|
@ -1467,14 +1467,14 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
exit_code = OUT_OF_MEMORY;
|
exit_code = OUT_OF_MEMORY;
|
||||||
fprintf(ERRORFILE, "Container out of memory");
|
fprintf(ERRORFILE, "Container out of memory");
|
||||||
fflush(ERRORFILE);
|
fflush(ERRORFILE);
|
||||||
goto cleanup;
|
goto pre_launch_cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(LOGFILE, "Changing effective user to root...\n");
|
fprintf(LOGFILE, "Changing effective user to root...\n");
|
||||||
if (change_effective_user(0, user_gid) != 0) {
|
if (change_effective_user(0, user_gid) != 0) {
|
||||||
fprintf(ERRORFILE, "Could not change to effective users %d, %d\n", 0, user_gid);
|
fprintf(ERRORFILE, "Could not change to effective users %d, %d\n", 0, user_gid);
|
||||||
fflush(ERRORFILE);
|
fflush(ERRORFILE);
|
||||||
goto cleanup;
|
goto pre_launch_cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
snprintf(docker_command_with_binary, command_size, "%s %s", docker_binary, docker_command);
|
snprintf(docker_command_with_binary, command_size, "%s %s", docker_binary, docker_command);
|
||||||
|
@ -1487,7 +1487,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
"Could not invoke docker %s.\n", docker_command_with_binary);
|
"Could not invoke docker %s.\n", docker_command_with_binary);
|
||||||
fflush(ERRORFILE);
|
fflush(ERRORFILE);
|
||||||
exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
|
exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
|
||||||
goto cleanup;
|
goto post_launch_cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
snprintf(docker_inspect_command, command_size,
|
snprintf(docker_inspect_command, command_size,
|
||||||
|
@ -1504,7 +1504,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
"Could not inspect docker to get pid %s.\n", docker_inspect_command);
|
"Could not inspect docker to get pid %s.\n", docker_inspect_command);
|
||||||
fflush(ERRORFILE);
|
fflush(ERRORFILE);
|
||||||
exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
|
exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
|
||||||
goto cleanup;
|
goto post_launch_cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pid != 0) {
|
if (pid != 0) {
|
||||||
|
@ -1519,7 +1519,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
if (strcmp(*cgroup_ptr, "none") != 0 &&
|
if (strcmp(*cgroup_ptr, "none") != 0 &&
|
||||||
write_pid_to_cgroup_as_root(*cgroup_ptr, pid) != 0) {
|
write_pid_to_cgroup_as_root(*cgroup_ptr, pid) != 0) {
|
||||||
exit_code = WRITE_CGROUP_FAILED;
|
exit_code = WRITE_CGROUP_FAILED;
|
||||||
goto cleanup;
|
goto post_launch_cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1532,7 +1532,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
exit_code = WRITE_PIDFILE_FAILED;
|
exit_code = WRITE_PIDFILE_FAILED;
|
||||||
fprintf(ERRORFILE, "Could not write pid to %s", pid_file);
|
fprintf(ERRORFILE, "Could not write pid to %s", pid_file);
|
||||||
fflush(ERRORFILE);
|
fflush(ERRORFILE);
|
||||||
goto cleanup;
|
goto post_launch_cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
snprintf(docker_wait_command, command_size,
|
snprintf(docker_wait_command, command_size,
|
||||||
|
@ -1578,20 +1578,49 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
post_launch_cleanup:
|
||||||
|
|
||||||
fprintf(LOGFILE, "Removing docker container post-exit...\n");
|
fprintf(LOGFILE, "Removing docker container post-exit...\n");
|
||||||
snprintf(docker_rm_command, command_size,
|
snprintf(docker_rm_command, command_size,
|
||||||
"%s rm %s", docker_binary, container_id);
|
"%s rm %s", docker_binary, container_id);
|
||||||
FILE* rm_docker = popen(docker_rm_command, "w");
|
int rc, i, sleep_time = 1, max_iterations = 5;
|
||||||
if (pclose (rm_docker) != 0)
|
for (i = 0; i < max_iterations; i++) {
|
||||||
{
|
if (i > 0) {
|
||||||
fprintf (ERRORFILE,
|
sleep(sleep_time);
|
||||||
"Could not remove container %s.\n", docker_rm_command);
|
sleep_time *= 2;
|
||||||
fflush(ERRORFILE);
|
}
|
||||||
exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
|
FILE* rm_docker = popen(docker_rm_command, "w");
|
||||||
goto cleanup;
|
if (rm_docker == 0) {
|
||||||
|
fprintf(ERRORFILE,
|
||||||
|
"popen() failed: %s\n", strerror(errno));
|
||||||
|
fflush(ERRORFILE);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
rc = pclose(rm_docker);
|
||||||
|
if (rc == -1) {
|
||||||
|
fprintf(ERRORFILE,
|
||||||
|
"pclose() failed: %s\n", strerror(errno));
|
||||||
|
fflush(ERRORFILE);
|
||||||
|
} else if (WIFEXITED(rc)) {
|
||||||
|
if (WEXITSTATUS(rc) == 0) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
fprintf(ERRORFILE,
|
||||||
|
"docker rm command failed with exit status: %d\n", WEXITSTATUS(rc));
|
||||||
|
fflush(ERRORFILE);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
if (i == max_iterations) {
|
||||||
|
// Tried 5 times and failed.
|
||||||
|
fprintf(ERRORFILE,
|
||||||
|
"Could not remove container after %d tries: %s\n", max_iterations, docker_rm_command);
|
||||||
|
fflush(ERRORFILE);
|
||||||
|
exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT;
|
||||||
|
}
|
||||||
|
|
||||||
|
pre_launch_cleanup:
|
||||||
|
|
||||||
if (exit_code_file != NULL && write_exit_code_file_as_nm(exit_code_file, exit_code) < 0) {
|
if (exit_code_file != NULL && write_exit_code_file_as_nm(exit_code_file, exit_code) < 0) {
|
||||||
fprintf (ERRORFILE,
|
fprintf (ERRORFILE,
|
||||||
|
|
Loading…
Reference in New Issue