YARN-6846. Nodemanager can fail to fully delete application local directories when applications are killed. Contributed by Jason Lowe.

This commit is contained in:
Eric Payne 2017-08-02 10:59:33 -05:00
parent 5e4434f628
commit 48899134d2
2 changed files with 116 additions and 12 deletions

View File

@ -1837,7 +1837,7 @@ static int rmdir_as_nm(const char* path) {
int user_gid = getegid();
int ret = change_effective_user(nm_uid, nm_gid);
if (ret == 0) {
if (rmdir(path) != 0) {
if (rmdir(path) != 0 && errno != ENOENT) {
fprintf(LOGFILE, "rmdir of %s failed - %s\n", path, strerror(errno));
ret = -1;
}
@ -1882,7 +1882,7 @@ static int unlink_helper(int dirfd, const char *name, int flags) {
} else {
ret = unlink(name);
}
if (ret >= 0) {
if (ret >= 0 || errno == ENOENT) {
return 0;
}
return errno;
@ -1919,7 +1919,7 @@ static int is_symlink_helper(int dirfd, const char *name)
static int recursive_unlink_helper(int dirfd, const char *name,
const char* fullpath)
{
int fd = -1, ret = 0;
int fd = -1, ret = 0, unlink_err = 0;
DIR *dfd = NULL;
struct stat stat;
@ -1928,6 +1928,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
ret = is_symlink_helper(dirfd, name);
if (ret < 0) {
// is_symlink_helper failed.
if (ret == -ENOENT) {
ret = 0;
goto done;
}
ret = -ret;
fprintf(LOGFILE, "is_symlink_helper(%s) failed: %s\n",
fullpath, strerror(ret));
@ -1949,6 +1953,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
if (fd == -EACCES) {
ret = chmod_helper(dirfd, name, 0700);
if (ret) {
if (ret == ENOENT) {
ret = 0;
goto done;
}
fprintf(LOGFILE, "chmod(%s) failed: %s\n", fullpath, strerror(ret));
goto done;
}
@ -1956,11 +1964,19 @@ static int recursive_unlink_helper(int dirfd, const char *name,
}
if (fd < 0) {
ret = -fd;
if (ret == ENOENT) {
ret = 0;
goto done;
}
fprintf(LOGFILE, "error opening %s: %s\n", fullpath, strerror(ret));
goto done;
}
if (fstat(fd, &stat) < 0) {
ret = errno;
if (ret == ENOENT) {
ret = 0;
goto done;
}
fprintf(LOGFILE, "failed to stat %s: %s\n", fullpath, strerror(ret));
goto done;
}
@ -1974,6 +1990,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
dfd = fdopendir(fd);
if (!dfd) {
ret = errno;
if (ret == ENOENT) {
ret = 0;
goto done;
}
fprintf(LOGFILE, "fopendir(%s) failed: %s\n", fullpath, strerror(ret));
goto done;
}
@ -1985,7 +2005,7 @@ static int recursive_unlink_helper(int dirfd, const char *name,
de = readdir(dfd);
if (!de) {
ret = errno;
if (ret) {
if (ret && ret != ENOENT) {
fprintf(LOGFILE, "readdir(%s) failed: %s\n", fullpath, strerror(ret));
goto done;
}
@ -2003,10 +2023,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
ret = ENOMEM;
goto done;
}
ret = recursive_unlink_helper(fd, de->d_name, new_fullpath);
int rc = recursive_unlink_helper(fd, de->d_name, new_fullpath);
free(new_fullpath);
if (ret) {
goto done;
if (rc && !unlink_err) {
unlink_err = rc;
}
}
if (dirfd != -1) {
@ -2017,7 +2037,7 @@ static int recursive_unlink_helper(int dirfd, const char *name,
}
}
}
ret = 0;
ret = unlink_err;
done:
if (fd >= 0) {
close(fd);
@ -2048,9 +2068,6 @@ static int delete_path(const char *full_path,
return PATH_TO_DELETE_IS_NULL;
}
ret = recursive_unlink_children(full_path);
if (ret == ENOENT) {
return 0;
}
if (ret != 0) {
fprintf(LOGFILE, "Error while deleting %s: %d (%s)\n",
full_path, ret, strerror(ret));

View File

@ -368,7 +368,7 @@ void test_delete_app() {
sprintf(buffer, "chmod 000 %s/who/let", container_dir);
run(buffer);
// delete container directory
// delete application directory
int ret = delete_as_user(yarn_username, app_dir, NULL);
if (ret != 0) {
printf("FAIL: return code from delete_as_user is %d\n", ret);
@ -390,6 +390,13 @@ void test_delete_app() {
printf("FAIL: accidently deleted file %s\n", dont_touch);
exit(1);
}
// verify attempt to delete a nonexistent directory does not fail
ret = delete_as_user(yarn_username, app_dir, NULL);
if (ret != 0) {
printf("FAIL: return code from delete_as_user is %d\n", ret);
exit(1);
}
free(app_dir);
free(container_dir);
free(dont_touch);
@ -975,6 +982,83 @@ static void expect_type(const char *path, int mode) {
}
}
static void test_delete_race_internal() {
char* app_dir = get_app_directory(TEST_ROOT "/local-2", yarn_username, "app_1");
char* container_dir = get_container_work_directory(TEST_ROOT "/local-2",
yarn_username, "app_1", "container_1");
char buffer[100000];
sprintf(buffer, "mkdir -p %s/a/b/c/d", container_dir);
run(buffer);
int i;
for (i = 0; i < 100; ++i) {
sprintf(buffer, "%s/a/f%d", container_dir, i);
touch_or_die(buffer);
sprintf(buffer, "%s/a/b/f%d", container_dir, i);
touch_or_die(buffer);
sprintf(buffer, "%s/a/b/c/f%d", container_dir, i);
touch_or_die(buffer);
sprintf(buffer, "%s/a/b/c/d/f%d", container_dir, i);
touch_or_die(buffer);
}
pid_t child = fork();
if (child == -1) {
printf("FAIL: fork failed\n");
exit(1);
} else if (child == 0) {
// delete container directory
char * dirs[] = {app_dir, 0};
int ret = delete_as_user(yarn_username, "container_1" , dirs);
if (ret != 0) {
printf("FAIL: return code from delete_as_user is %d\n", ret);
exit(1);
}
exit(0);
} else {
// delete application directory
int ret = delete_as_user(yarn_username, app_dir, NULL);
int status = 0;
if (waitpid(child, &status, 0) == -1) {
printf("FAIL: waitpid %" PRId64 " failed - %s\n", (int64_t)child, strerror(errno));
exit(1);
}
if (!WIFEXITED(status)) {
printf("FAIL: child %" PRId64 " didn't exit - %d\n", (int64_t)child, status);
exit(1);
}
if (WEXITSTATUS(status) != 0) {
printf("FAIL: child %" PRId64 " exited with bad status %d\n",
(int64_t)child, WEXITSTATUS(status));
exit(1);
}
if (ret != 0) {
printf("FAIL: return code from delete_as_user is %d\n", ret);
exit(1);
}
}
// check to make sure the app directory is gone
if (access(app_dir, R_OK) == 0) {
printf("FAIL: didn't delete the directory - %s\n", app_dir);
exit(1);
}
free(app_dir);
free(container_dir);
}
void test_delete_race() {
if (initialize_user(yarn_username, local_dirs)) {
printf("FAIL: failed to initialize user %s\n", yarn_username);
exit(1);
}
int i;
for (i = 0; i < 100; ++i) {
test_delete_race_internal();
}
}
int recursive_unlink_children(const char *name);
void test_recursive_unlink_children() {
@ -1204,6 +1288,9 @@ int main(int argc, char **argv) {
printf("\nTesting delete_app()\n");
test_delete_app();
printf("\nTesting delete race\n");
test_delete_race();
printf("\nTesting is_feature_enabled()\n");
test_is_feature_enabled();