From dbf429ee4e81011235d9d2148f207198f1ee79b1 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 4 Sep 2003 16:11:59 +0000 Subject: [PATCH] Kill a job allocation if the response message send to srun fails. This prevents an orphan job if srun dies after sending the request or the network fails or the authentication mechanism fails. --- src/slurmctld/proc_req.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 448de6f728c..97c9ad21aa7 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -65,6 +65,7 @@ static void _fill_ctld_conf(slurm_ctl_conf_t * build_ptr); static inline bool _is_super_user(uid_t uid); +static void _kill_job_on_msg_fail(uint32_t job_id); static int _make_step_cred(struct step_record *step_rec, slurm_cred_t *slurm_cred); inline static void _slurm_rpc_allocate_resources(slurm_msg_t * msg); @@ -271,6 +272,22 @@ static inline bool _is_super_user(uid_t uid) return false; } +/* _kill_job_on_msg_fail - The request to create a job record succeeded, + * but the reply message to srun failed.
We kill the job to avoid + * leaving it orphaned */ +static void _kill_job_on_msg_fail(uint32_t job_id) +{ + /* Locks: Write job, write node */ + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; + + error("Job allocate response msg send failure, killing JobId=%u", + job_id); + lock_slurmctld(job_write_lock); + job_complete(job_id, 0, false, 0); + unlock_slurmctld(job_write_lock); +} + /* create a credential for a given job step, return error code */ static int _make_step_cred(struct step_record *step_rec, slurm_cred_t *slurm_cred) @@ -355,7 +372,8 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) response_msg.msg_type = RESPONSE_RESOURCE_ALLOCATION; response_msg.data = &alloc_msg; - slurm_send_node_msg(msg->conn_fd, &response_msg); + if (slurm_send_node_msg(msg->conn_fd, &response_msg) < 0) + _kill_job_on_msg_fail(job_id); (void) dump_all_job_state(); } else { /* allocate error */ info("_slurm_rpc_allocate_resources: %s ", @@ -459,7 +477,8 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) response_msg.msg_type = RESPONSE_ALLOCATION_AND_RUN_JOB_STEP; response_msg.data = &alloc_msg; - slurm_send_node_msg(msg->conn_fd, &response_msg); + if (slurm_send_node_msg(msg->conn_fd, &response_msg) < 0) + _kill_job_on_msg_fail(job_id); slurm_cred_destroy(slurm_cred); #ifdef HAVE_LIBELAN3 qsw_free_jobinfo(alloc_msg.qsw_job); -- GitLab