From dbf429ee4e81011235d9d2148f207198f1ee79b1 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 4 Sep 2003 16:11:59 +0000
Subject: [PATCH] Kill a job allocation if the response message send to srun
 fails. This prevents an orphan job if srun dies after sending the request or
 the network fails or the authenticaion mechanism fails.

---
 src/slurmctld/proc_req.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 448de6f728c..97c9ad21aa7 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -65,6 +65,7 @@
 
 static void         _fill_ctld_conf(slurm_ctl_conf_t * build_ptr);
 static inline bool 	_is_super_user(uid_t uid);
+static void         _kill_job_on_msg_fail(uint32_t job_id);
 static int          _make_step_cred(struct step_record *step_rec, 
 				    slurm_cred_t *slurm_cred);
 inline static void  _slurm_rpc_allocate_resources(slurm_msg_t * msg);
@@ -271,6 +272,22 @@ static inline bool _is_super_user(uid_t uid)
 		return false;
 }
 
+/* _kill_job_on_msg_fail - The request to create a job record succeeded,
+ *	but the reply message to srun failed. We kill the job to avoid
+ *	leaving it orphaned. job_id is the job to complete. */
+static void _kill_job_on_msg_fail(uint32_t job_id)
+{
+	/* Locks: Write job, write node */
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
+
+	error("Job allocate response msg send failure, killing JobId=%u",
+		job_id);
+	lock_slurmctld(job_write_lock);	/* held only around job_complete() */
+	job_complete(job_id, 0, false, 0);
+	unlock_slurmctld(job_write_lock);
+}
+
 /* create a credential for a given job step, return error code */
 static int _make_step_cred(struct step_record *step_rec, 
 			   slurm_cred_t *slurm_cred)
@@ -355,7 +372,8 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 		response_msg.msg_type = RESPONSE_RESOURCE_ALLOCATION;
 		response_msg.data = &alloc_msg;
 
-		slurm_send_node_msg(msg->conn_fd, &response_msg);
+		if (slurm_send_node_msg(msg->conn_fd, &response_msg) < 0)
+			_kill_job_on_msg_fail(job_id);
 		(void) dump_all_job_state();
 	} else {	/* allocate error */
 		info("_slurm_rpc_allocate_resources: %s ", 
@@ -459,7 +477,8 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 		response_msg.msg_type = RESPONSE_ALLOCATION_AND_RUN_JOB_STEP;
 		response_msg.data = &alloc_msg;
 
-		slurm_send_node_msg(msg->conn_fd, &response_msg);
+		if (slurm_send_node_msg(msg->conn_fd, &response_msg) < 0)
+			_kill_job_on_msg_fail(job_id);
 		slurm_cred_destroy(slurm_cred);
 #ifdef HAVE_LIBELAN3
 		qsw_free_jobinfo(alloc_msg.qsw_job);
-- 
GitLab