From 9d8ae0f7029ba40446ccf45a481b4ec67c204659 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" <mgrondona@llnl.gov> Date: Fri, 7 Oct 2011 10:31:56 -0700 Subject: [PATCH] slurmstepd: Fix race in run_script_as_user As reported by Sam Lang on slurm-dev, task_epilog scripts are not held before exec, and thus there is a race condition between when the task_epilog is launched and slurmstepd calls slurm_container_add() during which the task_epilog script could either run to completion, or launch other processes that escape any job container defined by configuration. Use the new "exec_wait" api to have the child wait before exec just as is done in fork_all_tasks. Based on an original idea by Sam Lang <samlang@gmail.com>. --- src/slurmd/slurmstepd/mgr.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 59bbf7322b0..a7da99a185d 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -2183,6 +2183,7 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, { int status, rc, opt; pid_t cpid; + struct exec_wait_info *ei; xassert(env); if (path == NULL || path[0] == '\0') @@ -2199,11 +2200,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, (slurm_container_create(job) != SLURM_SUCCESS)) error("slurm_container_create: %m"); - if ((cpid = fork()) < 0) { + if ((ei = fork_child_with_wait_info(0)) == NULL) { error ("executing %s: fork: %m", name); return -1; } - if (cpid == 0) { + if ((cpid = exec_wait_get_pid (ei)) == 0) { struct priv_state sprivs; char *argv[2]; @@ -2230,6 +2231,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, #else setpgrp(); #endif + /* + * Wait for signal from parent + */ + exec_wait_child_wait_for_parent (ei); + execve(path, argv, env); error("execve(): %m"); exit(127); @@ -2237,6 +2243,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, if (slurm_container_add(job, cpid) != SLURM_SUCCESS) error("slurm_container_add: %m"); + + if (exec_wait_signal_child (ei) < 0) + error ("run_script_as_user: Failed to wakeup %s", name); + exec_wait_info_destroy (ei); + if (max_wait < 0) opt = 0; else -- GitLab