From 5b8dba9eece59a684b4c0b276ec8dd9146032b2f Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" <mgrondona@llnl.gov> Date: Sat, 19 May 2012 07:24:10 -0700 Subject: [PATCH] slurmstepd: Kill remaining children if fork fails On a failure of fork(2), slurmstepd would print an error and exit, possibly leaving previously forked children waiting. Ensure a better cleanup by killing all active children on fork failure before exiting slurmstepd. --- src/slurmd/slurmstepd/mgr.c | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 359818221b4..5717cc9022a 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -1216,6 +1216,47 @@ static int exec_wait_signal (struct exec_wait_info *e, slurmd_job_t *job) return (0); } +/* + * Send SIGKILL to child in exec_wait_info 'e' + * Returns 0 for success, -1 for failure. + */ +static int exec_wait_kill_child (struct exec_wait_info *e) +{ + if (e->pid < 0) + return (-1); + if (kill (e->pid, SIGKILL) < 0) + return (-1); + e->pid = -1; + return (0); +} + +/* + * Send all children in exec_wait_list SIGKILL. + * Returns 0 for success or < 0 on failure. + */ +static int exec_wait_kill_children (List exec_wait_list) +{ + int rc = 0; + int count; + struct exec_wait_info *e; + ListIterator i; + + if ((count = list_count (exec_wait_list)) == 0) + return (0); + + verbose ("Killing %d remaining child%s", + count, (count > 1 ? "ren" : "")); + + i = list_iterator_create (exec_wait_list); + if (i == NULL) + return error ("exec_wait_kill_children: iterator_create: %m"); + + while ((e = list_next (i))) + rc += exec_wait_kill_child (e); + list_iterator_destroy (i); + return (rc); +} + static void prepare_stdio (slurmd_job_t *job, slurmd_task_info_t *task) { #ifdef HAVE_PTY_H @@ -1347,6 +1388,7 @@ _fork_all_tasks(slurmd_job_t *job, bool *io_initialized) if ((ei = fork_child_with_wait_info (i)) == NULL) { error("child fork: %m"); + exec_wait_kill_children (exec_wait_list); rc = SLURM_ERROR; goto fail4; } else if ((pid = exec_wait_get_pid (ei)) == 0) { /* child */ -- GitLab