From e4c4a8bbaf89f88147eb4e7b23c1307e4657af7b Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 3 Sep 2004 21:08:02 +0000 Subject: [PATCH] Send SIGTERM to batch script before SIGKILL for mpirun to cleanup on BGL. --- NEWS | 2 ++ src/slurmd/req.c | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index bcb203892e0..ba878f00aae 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,8 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 0.4.0-pre3 ============================= + -- Send SIGTERM to batch script before SIGKILL for mpirun cleanup on + Blue Gene/L * Changes in SLURM 0.4.0-pre2 ============================= diff --git a/src/slurmd/req.c b/src/slurmd/req.c index e95321a8120..b84d107d0e6 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -61,7 +61,7 @@ static bool _slurm_authorized_user(uid_t uid); static bool _job_still_running(uint32_t job_id); -static int _kill_all_active_steps(uint32_t jobid, int sig); +static int _kill_all_active_steps(uint32_t jobid, int sig, bool batch); static int _launch_tasks(launch_tasks_request_msg_t *, slurm_addr *); static void _rpc_launch_tasks(slurm_msg_t *, slurm_addr *); static void _rpc_spawn_task(slurm_msg_t *, slurm_addr *); @@ -675,7 +675,7 @@ _rpc_timelimit(slurm_msg_t *msg, slurm_addr *cli_addr) } /* - * Indicate to slurmctld that we've recieved the message + * Indicate to slurmctld that we've received the message */ slurm_send_rc_msg(msg, SLURM_SUCCESS); slurm_close_accepted_conn(msg->conn_fd); @@ -687,7 +687,7 @@ _rpc_timelimit(slurm_msg_t *msg, slurm_addr *cli_addr) */ _kill_running_session_mgrs(req->job_id, SIGXCPU, "SIGXCPU"); - nsteps = _kill_all_active_steps(req->job_id, SIGTERM); + nsteps = _kill_all_active_steps(req->job_id, SIGTERM, false); verbose( "Job %u: timeout: sent SIGTERM to %d active steps", req->job_id, nsteps ); @@ -846,9 +846,15 @@ _rpc_reattach_tasks(slurm_msg_t *msg, slurm_addr *cli) } - +/* + * _kill_all_active_steps - signals all steps of a job + * jobid IN - id of job to signal + * sig IN - signal to send + * batch IN - if true signal batch script, otherwise skip it + * RET count of signaled job steps (plus batch script, if applicable) + */ static int -_kill_all_active_steps(uint32_t jobid, int sig) +_kill_all_active_steps(uint32_t jobid, int sig, bool batch) { List steps = shm_get_steps(); ListIterator i = list_iterator_create(steps); @@ -864,10 +870,7 @@ _kill_all_active_steps(uint32_t jobid, int sig) continue; } - /* XXX? - * We don't send anything but SIGKILL to batch jobs - */ - if ((s->stepid == NO_VAL) && (sig != SIGKILL)) + if ((s->stepid == NO_VAL) && (!batch)) continue; step_cnt++; @@ -972,15 +975,15 @@ _rpc_kill_job(slurm_msg_t *msg, slurm_addr *cli) debug("credential for job %u revoked", req->job_id); } - nsteps = _kill_all_active_steps(req->job_id, SIGTERM); + nsteps = _kill_all_active_steps(req->job_id, SIGTERM, true); /* - * If there are currently no active job steps, and no + * If there are currently no active job steps and no * configured epilog to run, bypass asynchronous reply and * notify slurmctld that we have already completed this * request. */ - if ((nsteps == 0) && !conf->epilog && (msg->conn_fd >= 0)) { + if ((nsteps == 0) && !conf->epilog) { if (msg->conn_fd >= 0) slurm_send_rc_msg(msg, ESLURMD_KILL_JOB_ALREADY_COMPLETE); @@ -1005,7 +1008,7 @@ _rpc_kill_job(slurm_msg_t *msg, slurm_addr *cli) * Check for corpses */ if ( !_pause_for_job_completion (req->job_id, 5) - && _kill_all_active_steps(req->job_id, SIGKILL) ) { + && _kill_all_active_steps(req->job_id, SIGKILL, true) ) { /* * Block until all user processes are complete. */ -- GitLab