diff --git a/NEWS b/NEWS index f2468a1769109d3675f863a79efed3d0deac5d91..07a76fe903a1ad0210fee46b564c02710951af42 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,7 @@ documents those changes that are of interest to users and admins. general code clean-up. -- Add ability to change MaxNodes and ExcNodeList for pending job using scontrol. + -- Purge zombie processes spawned via event triggers. * Changes in SLURM 1.2.6 ======================== diff --git a/src/slurmctld/trigger_mgr.c b/src/slurmctld/trigger_mgr.c index 3076e33b0f7b2e16cb66e6f03869fd49b78bcbff..8fa90cccc78874206e2b3422b0d33fb4af4da846 100644 --- a/src/slurmctld/trigger_mgr.c +++ b/src/slurmctld/trigger_mgr.c @@ -213,6 +213,8 @@ extern int trigger_clear(uid_t uid, trigger_info_msg_t *msg) if (trig_in->user_id && (trig_in->user_id != trig_test->user_id)) continue; + if (trig_test->state == 2) /* wait for proc termination */ + continue; list_delete(trig_iter); rc = SLURM_SUCCESS; } @@ -900,13 +902,32 @@ extern void trigger_process(void) } else if ((trig_in->state == 2) && (difftime(now, trig_in->trig_time) > MAX_PROG_TIME)) { + bool purge; + + if (trig_in->group_id != 0) { + pid_t rc; + + killpg(trig_in->group_id, SIGKILL); + rc = waitpid(trig_in->group_id, NULL, WNOHANG); + if ((rc == trig_in->group_id) + || ((rc == -1) && (errno == ECHILD))) + purge = true; + else + purge = false; + } else /* No PID to wait for */ + purge = true; + + if (purge) { #if _DEBUG - info("purging trigger[%u]", trig_in->trig_id); + info("purging trigger[%u]", trig_in->trig_id); #endif - if (trig_in->group_id != 0) - kill(-trig_in->group_id, SIGKILL); - list_delete(trig_iter); - state_change = true; + list_delete(trig_iter); + state_change = true; + } + } else if (trig_in->state == 2) { + /* Eliminate zombie processes right away. + * Purge trigger entry above MAX_PROG_TIME later */ + waitpid(trig_in->group_id, NULL, WNOHANG); } } list_iterator_destroy(trig_iter);