From 5ec195edbcd982a3e7c2a4ea760e3ce860c87143 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 17 Apr 2018 09:45:55 -0500 Subject: [PATCH] YARN-7189. Container-executor doesn't remove Docker containers that error out early. Contributed by Eric Badger (cherry picked from commit 391ac5cdd2f31db2341bb731daee094b9ca309ec) --- .../impl/container-executor.c | 63 ++++++++++++++----- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c index c1a42ca7a50..109ff7384e6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c @@ -1444,7 +1444,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, if (exit_code != 0) { fprintf(ERRORFILE, "Could not create script path\n"); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } fprintf(LOGFILE, "Creating local dirs...\n"); @@ -1455,7 +1455,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, if (exit_code != 0) { fprintf(ERRORFILE, "Could not create local files and directories %d %d\n", container_file_source, cred_file_source); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } docker_command = construct_docker_command(command_file); @@ -1467,14 +1467,14 @@ int launch_docker_container_as_user(const char * user, const char *app_id, exit_code = OUT_OF_MEMORY; fprintf(ERRORFILE, "Container out of memory"); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } fprintf(LOGFILE, "Changing effective user to root...\n"); if (change_effective_user(0, user_gid) != 0) { fprintf(ERRORFILE, "Could not change to effective users %d, %d\n", 0, user_gid); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } snprintf(docker_command_with_binary, command_size, "%s %s", docker_binary, docker_command); @@ -1487,7 +1487,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, "Could not invoke docker %s.\n", docker_command_with_binary); fflush(ERRORFILE); exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT; - goto cleanup; + goto post_launch_cleanup; } snprintf(docker_inspect_command, command_size, @@ -1504,7 +1504,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, "Could not inspect docker to get pid %s.\n", docker_inspect_command); fflush(ERRORFILE); exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT; - goto cleanup; + goto post_launch_cleanup; } if (pid != 0) { @@ -1519,7 +1519,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, if (strcmp(*cgroup_ptr, "none") != 0 && write_pid_to_cgroup_as_root(*cgroup_ptr, pid) != 0) { exit_code = WRITE_CGROUP_FAILED; - goto cleanup; + goto post_launch_cleanup; } } } @@ -1532,7 +1532,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, exit_code = WRITE_PIDFILE_FAILED; fprintf(ERRORFILE, "Could not write pid to %s", pid_file); fflush(ERRORFILE); - goto cleanup; + goto post_launch_cleanup; } snprintf(docker_wait_command, command_size, @@ -1578,20 +1578,49 @@ int launch_docker_container_as_user(const char * user, const char *app_id, } } +post_launch_cleanup: + fprintf(LOGFILE, "Removing docker container post-exit...\n"); snprintf(docker_rm_command, command_size, "%s rm %s", docker_binary, container_id); - FILE* rm_docker = popen(docker_rm_command, "w"); - if (pclose (rm_docker) != 0) - { - fprintf (ERRORFILE, - "Could not remove container %s.\n", docker_rm_command); - fflush(ERRORFILE); - exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT; - goto cleanup; + int rc, i, sleep_time = 1, max_iterations = 5; + for (i = 0; i < max_iterations; i++) { + if (i > 0) { + sleep(sleep_time); + sleep_time *= 2; + } + FILE* rm_docker = popen(docker_rm_command, "w"); + if (rm_docker == 0) { + fprintf(ERRORFILE, + "popen() failed: %s\n", strerror(errno)); + fflush(ERRORFILE); + continue; + } + rc = pclose(rm_docker); + if (rc == -1) { + fprintf(ERRORFILE, + "pclose() failed: %s\n", strerror(errno)); + fflush(ERRORFILE); + } else if (WIFEXITED(rc)) { + if (WEXITSTATUS(rc) == 0) { + break; + } else { + fprintf(ERRORFILE, + "docker rm command failed with exit status: %d\n", WEXITSTATUS(rc)); + fflush(ERRORFILE); + } + } } -cleanup: + if (i == max_iterations) { + // Tried 5 times and failed. + fprintf(ERRORFILE, + "Could not remove container after %d tries: %s\n", max_iterations, docker_rm_command); + fflush(ERRORFILE); + exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT; + } + +pre_launch_cleanup: if (exit_code_file != NULL && write_exit_code_file_as_nm(exit_code_file, exit_code) < 0) { fprintf (ERRORFILE,