YARN-6846. Nodemanager can fail to fully delete application local directories when applications are killed. Contributed by Jason Lowe.
(cherry picked from commit 48899134d2)
This commit is contained in:
parent
f34c01b92a
commit
686bdc48b8
|
@ -1829,7 +1829,7 @@ static int rmdir_as_nm(const char* path) {
|
||||||
int user_gid = getegid();
|
int user_gid = getegid();
|
||||||
int ret = change_effective_user(nm_uid, nm_gid);
|
int ret = change_effective_user(nm_uid, nm_gid);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
if (rmdir(path) != 0) {
|
if (rmdir(path) != 0 && errno != ENOENT) {
|
||||||
fprintf(LOGFILE, "rmdir of %s failed - %s\n", path, strerror(errno));
|
fprintf(LOGFILE, "rmdir of %s failed - %s\n", path, strerror(errno));
|
||||||
ret = -1;
|
ret = -1;
|
||||||
}
|
}
|
||||||
|
@ -1874,7 +1874,7 @@ static int unlink_helper(int dirfd, const char *name, int flags) {
|
||||||
} else {
|
} else {
|
||||||
ret = unlink(name);
|
ret = unlink(name);
|
||||||
}
|
}
|
||||||
if (ret >= 0) {
|
if (ret >= 0 || errno == ENOENT) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return errno;
|
return errno;
|
||||||
|
@ -1911,7 +1911,7 @@ static int is_symlink_helper(int dirfd, const char *name)
|
||||||
static int recursive_unlink_helper(int dirfd, const char *name,
|
static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
const char* fullpath)
|
const char* fullpath)
|
||||||
{
|
{
|
||||||
int fd = -1, ret = 0;
|
int fd = -1, ret = 0, unlink_err = 0;
|
||||||
DIR *dfd = NULL;
|
DIR *dfd = NULL;
|
||||||
struct stat stat;
|
struct stat stat;
|
||||||
|
|
||||||
|
@ -1920,6 +1920,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
ret = is_symlink_helper(dirfd, name);
|
ret = is_symlink_helper(dirfd, name);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
// is_symlink_helper failed.
|
// is_symlink_helper failed.
|
||||||
|
if (ret == -ENOENT) {
|
||||||
|
ret = 0;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
ret = -ret;
|
ret = -ret;
|
||||||
fprintf(LOGFILE, "is_symlink_helper(%s) failed: %s\n",
|
fprintf(LOGFILE, "is_symlink_helper(%s) failed: %s\n",
|
||||||
fullpath, strerror(ret));
|
fullpath, strerror(ret));
|
||||||
|
@ -1941,6 +1945,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
if (fd == -EACCES) {
|
if (fd == -EACCES) {
|
||||||
ret = chmod_helper(dirfd, name, 0700);
|
ret = chmod_helper(dirfd, name, 0700);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
|
if (ret == ENOENT) {
|
||||||
|
ret = 0;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
fprintf(LOGFILE, "chmod(%s) failed: %s\n", fullpath, strerror(ret));
|
fprintf(LOGFILE, "chmod(%s) failed: %s\n", fullpath, strerror(ret));
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
@ -1948,11 +1956,19 @@ static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
}
|
}
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
ret = -fd;
|
ret = -fd;
|
||||||
|
if (ret == ENOENT) {
|
||||||
|
ret = 0;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
fprintf(LOGFILE, "error opening %s: %s\n", fullpath, strerror(ret));
|
fprintf(LOGFILE, "error opening %s: %s\n", fullpath, strerror(ret));
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
if (fstat(fd, &stat) < 0) {
|
if (fstat(fd, &stat) < 0) {
|
||||||
ret = errno;
|
ret = errno;
|
||||||
|
if (ret == ENOENT) {
|
||||||
|
ret = 0;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
fprintf(LOGFILE, "failed to stat %s: %s\n", fullpath, strerror(ret));
|
fprintf(LOGFILE, "failed to stat %s: %s\n", fullpath, strerror(ret));
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
@ -1966,6 +1982,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
dfd = fdopendir(fd);
|
dfd = fdopendir(fd);
|
||||||
if (!dfd) {
|
if (!dfd) {
|
||||||
ret = errno;
|
ret = errno;
|
||||||
|
if (ret == ENOENT) {
|
||||||
|
ret = 0;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
fprintf(LOGFILE, "fopendir(%s) failed: %s\n", fullpath, strerror(ret));
|
fprintf(LOGFILE, "fopendir(%s) failed: %s\n", fullpath, strerror(ret));
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
@ -1977,7 +1997,7 @@ static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
de = readdir(dfd);
|
de = readdir(dfd);
|
||||||
if (!de) {
|
if (!de) {
|
||||||
ret = errno;
|
ret = errno;
|
||||||
if (ret) {
|
if (ret && ret != ENOENT) {
|
||||||
fprintf(LOGFILE, "readdir(%s) failed: %s\n", fullpath, strerror(ret));
|
fprintf(LOGFILE, "readdir(%s) failed: %s\n", fullpath, strerror(ret));
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
|
@ -1995,10 +2015,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
ret = ENOMEM;
|
ret = ENOMEM;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
ret = recursive_unlink_helper(fd, de->d_name, new_fullpath);
|
int rc = recursive_unlink_helper(fd, de->d_name, new_fullpath);
|
||||||
free(new_fullpath);
|
free(new_fullpath);
|
||||||
if (ret) {
|
if (rc && !unlink_err) {
|
||||||
goto done;
|
unlink_err = rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dirfd != -1) {
|
if (dirfd != -1) {
|
||||||
|
@ -2009,7 +2029,7 @@ static int recursive_unlink_helper(int dirfd, const char *name,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ret = 0;
|
ret = unlink_err;
|
||||||
done:
|
done:
|
||||||
if (fd >= 0) {
|
if (fd >= 0) {
|
||||||
close(fd);
|
close(fd);
|
||||||
|
@ -2040,9 +2060,6 @@ static int delete_path(const char *full_path,
|
||||||
return PATH_TO_DELETE_IS_NULL;
|
return PATH_TO_DELETE_IS_NULL;
|
||||||
}
|
}
|
||||||
ret = recursive_unlink_children(full_path);
|
ret = recursive_unlink_children(full_path);
|
||||||
if (ret == ENOENT) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
fprintf(LOGFILE, "Error while deleting %s: %d (%s)\n",
|
fprintf(LOGFILE, "Error while deleting %s: %d (%s)\n",
|
||||||
full_path, ret, strerror(ret));
|
full_path, ret, strerror(ret));
|
||||||
|
|
|
@ -368,7 +368,7 @@ void test_delete_app() {
|
||||||
sprintf(buffer, "chmod 000 %s/who/let", container_dir);
|
sprintf(buffer, "chmod 000 %s/who/let", container_dir);
|
||||||
run(buffer);
|
run(buffer);
|
||||||
|
|
||||||
// delete container directory
|
// delete application directory
|
||||||
int ret = delete_as_user(yarn_username, app_dir, NULL);
|
int ret = delete_as_user(yarn_username, app_dir, NULL);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
printf("FAIL: return code from delete_as_user is %d\n", ret);
|
printf("FAIL: return code from delete_as_user is %d\n", ret);
|
||||||
|
@ -390,6 +390,13 @@ void test_delete_app() {
|
||||||
printf("FAIL: accidently deleted file %s\n", dont_touch);
|
printf("FAIL: accidently deleted file %s\n", dont_touch);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
// verify attempt to delete a nonexistent directory does not fail
|
||||||
|
ret = delete_as_user(yarn_username, app_dir, NULL);
|
||||||
|
if (ret != 0) {
|
||||||
|
printf("FAIL: return code from delete_as_user is %d\n", ret);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
free(app_dir);
|
free(app_dir);
|
||||||
free(container_dir);
|
free(container_dir);
|
||||||
free(dont_touch);
|
free(dont_touch);
|
||||||
|
@ -975,6 +982,83 @@ static void expect_type(const char *path, int mode) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void test_delete_race_internal() {
|
||||||
|
char* app_dir = get_app_directory(TEST_ROOT "/local-2", yarn_username, "app_1");
|
||||||
|
char* container_dir = get_container_work_directory(TEST_ROOT "/local-2",
|
||||||
|
yarn_username, "app_1", "container_1");
|
||||||
|
char buffer[100000];
|
||||||
|
|
||||||
|
sprintf(buffer, "mkdir -p %s/a/b/c/d", container_dir);
|
||||||
|
run(buffer);
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < 100; ++i) {
|
||||||
|
sprintf(buffer, "%s/a/f%d", container_dir, i);
|
||||||
|
touch_or_die(buffer);
|
||||||
|
sprintf(buffer, "%s/a/b/f%d", container_dir, i);
|
||||||
|
touch_or_die(buffer);
|
||||||
|
sprintf(buffer, "%s/a/b/c/f%d", container_dir, i);
|
||||||
|
touch_or_die(buffer);
|
||||||
|
sprintf(buffer, "%s/a/b/c/d/f%d", container_dir, i);
|
||||||
|
touch_or_die(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
pid_t child = fork();
|
||||||
|
if (child == -1) {
|
||||||
|
printf("FAIL: fork failed\n");
|
||||||
|
exit(1);
|
||||||
|
} else if (child == 0) {
|
||||||
|
// delete container directory
|
||||||
|
char * dirs[] = {app_dir, 0};
|
||||||
|
int ret = delete_as_user(yarn_username, "container_1" , dirs);
|
||||||
|
if (ret != 0) {
|
||||||
|
printf("FAIL: return code from delete_as_user is %d\n", ret);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
exit(0);
|
||||||
|
} else {
|
||||||
|
// delete application directory
|
||||||
|
int ret = delete_as_user(yarn_username, app_dir, NULL);
|
||||||
|
int status = 0;
|
||||||
|
if (waitpid(child, &status, 0) == -1) {
|
||||||
|
printf("FAIL: waitpid %" PRId64 " failed - %s\n", (int64_t)child, strerror(errno));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (!WIFEXITED(status)) {
|
||||||
|
printf("FAIL: child %" PRId64 " didn't exit - %d\n", (int64_t)child, status);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (WEXITSTATUS(status) != 0) {
|
||||||
|
printf("FAIL: child %" PRId64 " exited with bad status %d\n",
|
||||||
|
(int64_t)child, WEXITSTATUS(status));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (ret != 0) {
|
||||||
|
printf("FAIL: return code from delete_as_user is %d\n", ret);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check to make sure the app directory is gone
|
||||||
|
if (access(app_dir, R_OK) == 0) {
|
||||||
|
printf("FAIL: didn't delete the directory - %s\n", app_dir);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(app_dir);
|
||||||
|
free(container_dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_delete_race() {
|
||||||
|
if (initialize_user(yarn_username, local_dirs)) {
|
||||||
|
printf("FAIL: failed to initialize user %s\n", yarn_username);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < 100; ++i) {
|
||||||
|
test_delete_race_internal();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int recursive_unlink_children(const char *name);
|
int recursive_unlink_children(const char *name);
|
||||||
|
|
||||||
void test_recursive_unlink_children() {
|
void test_recursive_unlink_children() {
|
||||||
|
@ -1132,6 +1216,9 @@ int main(int argc, char **argv) {
|
||||||
printf("\nTesting delete_app()\n");
|
printf("\nTesting delete_app()\n");
|
||||||
test_delete_app();
|
test_delete_app();
|
||||||
|
|
||||||
|
printf("\nTesting delete race\n");
|
||||||
|
test_delete_race();
|
||||||
|
|
||||||
printf("\nTesting is_feature_enabled()\n");
|
printf("\nTesting is_feature_enabled()\n");
|
||||||
test_is_feature_enabled();
|
test_is_feature_enabled();
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue