diff --git a/src/slurmd/req.c b/src/slurmd/req.c
index 8d06fd6c377f73074cb38dbdfe0b4a8651874f7a..f20e86768464281519380478741b012485422ffd 100644
--- a/src/slurmd/req.c
+++ b/src/slurmd/req.c
@@ -433,6 +433,25 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr)
 	slurm_send_rc_msg(msg, rc);
 }
 
+/*
+ * For every job step returned by shm_get_steps() whose jobid matches,
+ * send signal signum to the pid recorded as that step's session id.
+ */
+static void
+_kill_running_session_mgrs(uint32_t jobid, int signum)
+{
+	List         steps = shm_get_steps();
+	ListIterator i     = list_iterator_create(steps);
+	job_step_t  *s     = NULL;
+
+	while ((s = list_next(i))) {
+		if (s->jobid == jobid) {
+			kill(s->sid, signum);
+		}
+	}
+	list_destroy(steps);
+}
+
 /* For the specified job_id: Send SIGXCPU, reply to slurmctld,
  * sleep(configured kill_wait), then send SIGKILL */
 static void
@@ -451,7 +470,13 @@ _rpc_timelimit(slurm_msg_t *msg, slurm_addr *cli_addr)
 		return;
 	}
 
-	step_cnt = _kill_all_active_steps(req->job_id, SIGXCPU);
+	/*
+	 * Send SIGXCPU to warn session managers of job steps for this
+	 * job that the job is about to be terminated
+	 */
+	_kill_running_session_mgrs(req->job_id, SIGXCPU);
+
+	step_cnt = _kill_all_active_steps(req->job_id, SIGTERM);
 
 	info("Timeout for job=%u, step_cnt=%d, kill_wait=%u",
 	     req->job_id, step_cnt, conf->cf.kill_wait);
diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c
index 8db6bf70e3874830bf9c122cc28f67346bef3c90..6a9d910c2fc4782dc36191522155e2c7939a0cc9 100644
--- a/src/slurmd/smgr.c
+++ b/src/slurmd/smgr.c
@@ -281,6 +281,15 @@ _exec_task(slurmd_job_t *job, int i)
 	exit(errno);
 }
 
+/* Set to 1 by _xcpu_handler(); read in _wait_for_all_tasks() */
+static volatile sig_atomic_t timelimit_exceeded = 0;
+
+/* Signal handler: only records that the signal was delivered */
+static void
+_xcpu_handler(int signum)
+{
+	timelimit_exceeded = 1;
+}
 
 /* wait for N tasks to exit, reporting exit status back to slurmd mgr
  */
@@ -295,12 +304,19 @@ _wait_for_all_tasks(slurmd_job_t *job)
 	int id = 0;
 	int fd = job->fdpair[1];
 
+	xsignal(SIGXCPU, _xcpu_handler);
+
 	while (waiting > 0) {
 		int status = 0;
 		pid_t pid;
 
 		if ((pid = waitpid(0, &status, 0)) < (pid_t) 0) {
-			if (errno != EINTR)
+			if (errno == EINTR) {
+				/* an interrupted wait is not an error;
+				 * report it only when the flag is set */
+				if (timelimit_exceeded)
+					error("job exceeded timelimit");
+			} else
 				error("waitpid: %m");
 			continue;
 		}