From 2743ffc6cb5f5206ff84e3b77fdbec8f18ab43e0 Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Mon, 10 May 2004 19:21:20 +0000 Subject: [PATCH] o change for KILL_JOB RPC. - send sigterm, pause 5 seconds then send sigkill. --- src/slurmd/req.c | 90 +++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/src/slurmd/req.c b/src/slurmd/req.c index e38b6c83d28..b0bcb0c0d5c 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -76,7 +76,15 @@ static void _rpc_pid2jid(slurm_msg_t *msg, slurm_addr *); static int _rpc_ping(slurm_msg_t *, slurm_addr *); static int _run_prolog(uint32_t jobid, uid_t uid); static int _run_epilog(uint32_t jobid, uid_t uid); -static int _wait_for_procs(uint32_t job_id); + +static bool _pause_for_job_completion (uint32_t jobid, int maxtime); +static int _waiter_init (uint32_t jobid); +static int _waiter_complete (uint32_t jobid); + +/* + * List of threads waiting for jobs to complete + */ +static List waiters; static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -841,6 +849,19 @@ _rpc_kill_job(slurm_msg_t *msg, slurm_addr *cli) return; } + /* + * Initialize a "waiter" thread for this jobid. If another + * thread is already waiting on termination of this job, + * _waiter_init() will return < 0. In this case, just + * notify slurmctld that we recvd the message successfully, + * then exit this thread. + */ + if (_waiter_init (req->job_id) < 0) { + slurm_send_rc_msg (msg, SLURM_SUCCESS); + return; + } + + /* * "revoke" all future credentials for this jobid */ @@ -851,7 +872,7 @@ _rpc_kill_job(slurm_msg_t *msg, slurm_addr *cli) debug("credential for job %u revoked", req->job_id); } - nsteps = _kill_all_active_steps(req->job_id, SIGKILL); + nsteps = _kill_all_active_steps(req->job_id, SIGTERM); /* * If there are currently no active job steps, and no @@ -878,13 +899,15 @@ _rpc_kill_job(slurm_msg_t *msg, slurm_addr *cli) } /* - * Block until all user processes are complete. - * If wait_for_procs returns with an error, then another - * thread is waiting for job to complete. Just exit - * this thread. + * Check for corpses */ - if (_wait_for_procs(req->job_id) < 0) - return; + if ( !_pause_for_job_completion (req->job_id, 5) + && _kill_all_active_steps(req->job_id, SIGKILL) ) { + /* + * Block until all user processes are complete. + */ + _pause_for_job_completion (req->job_id, 0); + } /* * Begin expiration period for cached information about job. @@ -907,6 +930,7 @@ _rpc_kill_job(slurm_msg_t *msg, slurm_addr *cli) done: _epilog_complete(req->job_id, rc); + _waiter_complete(req->job_id); } /* @@ -947,47 +971,43 @@ static void _waiter_destroy(struct waiter *wp) xfree(wp); } -/* - * Wait for session for jobid to expire; Only one thread - * per job will be in this call. - * - * Returns SLURM_SUCCESS when session has exited, - * SLURM_ERROR if there is already a thread waiting on job - */ -static int -_wait_for_procs(uint32_t jobid) +static int _waiter_init (uint32_t jobid) { - ListIterator i; - static List waiters; - struct waiter *wp; - if (!waiters) waiters = list_create((ListDelF) _waiter_destroy); - /* * Exit this thread if another thread is waiting on job */ - i = list_iterator_create(waiters); - if ((wp = list_find(i, (ListFindF) _find_waiter, (void *) &jobid))) + if (list_find_first (waiters, (ListFindF) _find_waiter, &jobid)) return SLURM_ERROR; else list_append(waiters, _waiter_create(jobid)); - list_iterator_destroy(i); - - verbose ("Waiting for job %u to complete", jobid); + return (SLURM_SUCCESS); +} - while (_job_still_running(jobid)) - sleep (1); +static int _waiter_complete (uint32_t jobid) +{ + return (list_delete_all (waiters, (ListFindF) _find_waiter, &jobid)); +} - verbose ("Job %u complete", jobid); +/* + * Like _wait_for_procs(), but only wait for up to maxtime seconds + * + * Returns true if all job + */ +static bool +_pause_for_job_completion (uint32_t jobid, int maxtime) +{ + int sec = 0, rc = 0; - /* - * Delete all waiting threads for this process + while ( ((sec++ < maxtime) || (maxtime == 0)) + && (rc = _job_still_running (jobid))) + sleep (1); + /* + * Return true if job is NOT running */ - list_delete_all(waiters, (ListFindF) _find_waiter, (void *) &jobid); - - return SLURM_SUCCESS; + return (!rc); } static void -- GitLab