From 48899134d2a77935a821072b5388ab1b1b7b399c Mon Sep 17 00:00:00 2001 From: Eric Payne Date: Wed, 2 Aug 2017 10:59:33 -0500 Subject: [PATCH] YARN-6846. Nodemanager can fail to fully delete application local directories when applications are killed. Contributed by Jason Lowe. --- .../impl/container-executor.c | 39 +++++--- .../test/test-container-executor.c | 89 ++++++++++++++++++- 2 files changed, 116 insertions(+), 12 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c index 5070d62a945..99f7b56a41d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c @@ -1837,7 +1837,7 @@ static int rmdir_as_nm(const char* path) { int user_gid = getegid(); int ret = change_effective_user(nm_uid, nm_gid); if (ret == 0) { - if (rmdir(path) != 0) { + if (rmdir(path) != 0 && errno != ENOENT) { fprintf(LOGFILE, "rmdir of %s failed - %s\n", path, strerror(errno)); ret = -1; } @@ -1882,7 +1882,7 @@ static int unlink_helper(int dirfd, const char *name, int flags) { } else { ret = unlink(name); } - if (ret >= 0) { + if (ret >= 0 || errno == ENOENT) { return 0; } return errno; @@ -1919,7 +1919,7 @@ static int is_symlink_helper(int dirfd, const char *name) static int recursive_unlink_helper(int dirfd, const char *name, const char* fullpath) { - int fd = -1, ret = 0; + int fd = -1, ret = 0, unlink_err = 0; DIR *dfd = NULL; struct stat stat; @@ -1928,6 +1928,10 @@ static int recursive_unlink_helper(int dirfd, const char *name, ret = is_symlink_helper(dirfd, name); if (ret < 0) { // is_symlink_helper failed. + if (ret == -ENOENT) { + ret = 0; + goto done; + } ret = -ret; fprintf(LOGFILE, "is_symlink_helper(%s) failed: %s\n", fullpath, strerror(ret)); @@ -1949,6 +1953,10 @@ static int recursive_unlink_helper(int dirfd, const char *name, if (fd == -EACCES) { ret = chmod_helper(dirfd, name, 0700); if (ret) { + if (ret == ENOENT) { + ret = 0; + goto done; + } fprintf(LOGFILE, "chmod(%s) failed: %s\n", fullpath, strerror(ret)); goto done; } @@ -1956,11 +1964,19 @@ static int recursive_unlink_helper(int dirfd, const char *name, } if (fd < 0) { ret = -fd; + if (ret == ENOENT) { + ret = 0; + goto done; + } fprintf(LOGFILE, "error opening %s: %s\n", fullpath, strerror(ret)); goto done; } if (fstat(fd, &stat) < 0) { ret = errno; + if (ret == ENOENT) { + ret = 0; + goto done; + } fprintf(LOGFILE, "failed to stat %s: %s\n", fullpath, strerror(ret)); goto done; } @@ -1974,6 +1990,10 @@ static int recursive_unlink_helper(int dirfd, const char *name, dfd = fdopendir(fd); if (!dfd) { ret = errno; + if (ret == ENOENT) { + ret = 0; + goto done; + } fprintf(LOGFILE, "fopendir(%s) failed: %s\n", fullpath, strerror(ret)); goto done; } @@ -1985,7 +2005,7 @@ static int recursive_unlink_helper(int dirfd, const char *name, de = readdir(dfd); if (!de) { ret = errno; - if (ret) { + if (ret && ret != ENOENT) { fprintf(LOGFILE, "readdir(%s) failed: %s\n", fullpath, strerror(ret)); goto done; } @@ -2003,10 +2023,10 @@ static int recursive_unlink_helper(int dirfd, const char *name, ret = ENOMEM; goto done; } - ret = recursive_unlink_helper(fd, de->d_name, new_fullpath); + int rc = recursive_unlink_helper(fd, de->d_name, new_fullpath); free(new_fullpath); - if (ret) { - goto done; + if (rc && !unlink_err) { + unlink_err = rc; } } if (dirfd != -1) { @@ -2017,7 +2037,7 @@ static int recursive_unlink_helper(int dirfd, const char *name, } } } - ret = 0; + ret = unlink_err; done: if (fd >= 0) { close(fd); @@ -2048,9 +2068,6 @@ static int delete_path(const char *full_path, return PATH_TO_DELETE_IS_NULL; } ret = recursive_unlink_children(full_path); - if (ret == ENOENT) { - return 0; - } if (ret != 0) { fprintf(LOGFILE, "Error while deleting %s: %d (%s)\n", full_path, ret, strerror(ret)); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c index b7d0e442f03..cf5f119a837 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c @@ -368,7 +368,7 @@ void test_delete_app() { sprintf(buffer, "chmod 000 %s/who/let", container_dir); run(buffer); - // delete container directory + // delete application directory int ret = delete_as_user(yarn_username, app_dir, NULL); if (ret != 0) { printf("FAIL: return code from delete_as_user is %d\n", ret); @@ -390,6 +390,13 @@ void test_delete_app() { printf("FAIL: accidently deleted file %s\n", dont_touch); exit(1); } + // verify attempt to delete a nonexistent directory does not fail + ret = delete_as_user(yarn_username, app_dir, NULL); + if (ret != 0) { + printf("FAIL: return code from delete_as_user is %d\n", ret); + exit(1); + } + free(app_dir); free(container_dir); free(dont_touch); @@ -975,6 +982,83 @@ static void expect_type(const char *path, int mode) { } } +static void test_delete_race_internal() { + char* app_dir = get_app_directory(TEST_ROOT "/local-2", yarn_username, "app_1"); + char* container_dir = get_container_work_directory(TEST_ROOT "/local-2", + yarn_username, "app_1", "container_1"); + char buffer[100000]; + + sprintf(buffer, "mkdir -p %s/a/b/c/d", container_dir); + run(buffer); + int i; + for (i = 0; i < 100; ++i) { + sprintf(buffer, "%s/a/f%d", container_dir, i); + touch_or_die(buffer); + sprintf(buffer, "%s/a/b/f%d", container_dir, i); + touch_or_die(buffer); + sprintf(buffer, "%s/a/b/c/f%d", container_dir, i); + touch_or_die(buffer); + sprintf(buffer, "%s/a/b/c/d/f%d", container_dir, i); + touch_or_die(buffer); + } + + pid_t child = fork(); + if (child == -1) { + printf("FAIL: fork failed\n"); + exit(1); + } else if (child == 0) { + // delete container directory + char * dirs[] = {app_dir, 0}; + int ret = delete_as_user(yarn_username, "container_1" , dirs); + if (ret != 0) { + printf("FAIL: return code from delete_as_user is %d\n", ret); + exit(1); + } + exit(0); + } else { + // delete application directory + int ret = delete_as_user(yarn_username, app_dir, NULL); + int status = 0; + if (waitpid(child, &status, 0) == -1) { + printf("FAIL: waitpid %" PRId64 " failed - %s\n", (int64_t)child, strerror(errno)); + exit(1); + } + if (!WIFEXITED(status)) { + printf("FAIL: child %" PRId64 " didn't exit - %d\n", (int64_t)child, status); + exit(1); + } + if (WEXITSTATUS(status) != 0) { + printf("FAIL: child %" PRId64 " exited with bad status %d\n", + (int64_t)child, WEXITSTATUS(status)); + exit(1); + } + if (ret != 0) { + printf("FAIL: return code from delete_as_user is %d\n", ret); + exit(1); + } + } + + // check to make sure the app directory is gone + if (access(app_dir, R_OK) == 0) { + printf("FAIL: didn't delete the directory - %s\n", app_dir); + exit(1); + } + + free(app_dir); + free(container_dir); +} + +void test_delete_race() { + if (initialize_user(yarn_username, local_dirs)) { + printf("FAIL: failed to initialize user %s\n", yarn_username); + exit(1); + } + int i; + for (i = 0; i < 100; ++i) { + test_delete_race_internal(); + } +} + int recursive_unlink_children(const char *name); void test_recursive_unlink_children() { @@ -1204,6 +1288,9 @@ int main(int argc, char **argv) { printf("\nTesting delete_app()\n"); test_delete_app(); + printf("\nTesting delete race\n"); + test_delete_race(); + printf("\nTesting is_feature_enabled()\n"); test_is_feature_enabled();