From 3c40e5e71293b97cae2be2db2224e75604c7510a Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 28 Jul 2008 15:47:38 +0000
Subject: [PATCH] Add jobid/stepid to MESSAGE_TASK_EXIT to address race
 condition when a job step is cancelled, another is started immediately
 (before the first one completely terminates) and ports are reused.
 NOTE: This change requires that SLURM be updated on all nodes of the
 cluster at the same time. There will be no impact upon currently running
 jobs (they will ignore the jobid/stepid at the end of the message).

---
 NEWS                             | 8 +++++++-
 slurm/slurm.h.in                 | 2 ++
 src/api/step_launch.c            | 7 +++++++
 src/common/slurm_protocol_pack.c | 8 ++++++--
 src/sattach/sattach.c            | 6 ++++++
 src/slurmd/slurmstepd/mgr.c      | 2 ++
 6 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index cf84d3cdb15..2ed32e45bf0 100644
--- a/NEWS
+++ b/NEWS
@@ -3,7 +3,13 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.3.7
 ========================
-
+ -- Add jobid/stepid to MESSAGE_TASK_EXIT to address race condition when 
+    a job step is cancelled, another is started immediately (before the 
+    first one completely terminates) and ports are reused. 
+    NOTE: This change requires that SLURM be updated on all nodes of the
+    cluster at the same time. There will be no impact upon currently running
+    jobs (they will ignore the jobid/stepid at the end of the message).
+ 
 * Changes in SLURM 1.3.6
 ========================
  -- Add new function to get information for a single job rather than always
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index c48aa5d11ff..622a3f9c67e 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -705,6 +705,8 @@ typedef struct task_ext_msg {
 	uint32_t num_tasks;
 	uint32_t *task_id_list;
 	uint32_t return_code;
+	uint32_t job_id;
+	uint32_t step_id;
 } task_exit_msg_t;
 
 typedef struct srun_ping_msg {
diff --git a/src/api/step_launch.c b/src/api/step_launch.c
index 981fb2da651..566beba232c 100644
--- a/src/api/step_launch.c
+++ b/src/api/step_launch.c
@@ -767,6 +767,13 @@ _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg)
 	task_exit_msg_t *msg = (task_exit_msg_t *) exit_msg->data;
 	int i;
 
+	if ((msg->job_id != sls->mpi_info->jobid) || 
+	    (msg->step_id != sls->mpi_info->stepid)) {
+		debug("Received MESSAGE_TASK_EXIT from wrong job: %u.%u",
+		      msg->job_id, msg->step_id);
+		return;
+	}
+
 	/* Record SIGTERM and SIGKILL termination codes to 
 	 * recognize abnormal termination */
 	if (WIFSIGNALED(msg->return_code)) {
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 9be1aabca44..deb12c3843e 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -3032,10 +3032,12 @@ static void
 _pack_task_exit_msg(task_exit_msg_t * msg, Buf buffer)
 {
 	xassert(msg != NULL);
-	pack32((uint32_t)msg->return_code, buffer);
-	pack32((uint32_t)msg->num_tasks, buffer);
+	pack32(msg->return_code, buffer);
+	pack32(msg->num_tasks, buffer);
 	pack32_array(msg->task_id_list,
 		     msg->num_tasks, buffer);
+	pack32(msg->job_id, buffer);
+	pack32(msg->step_id, buffer);
 }
 
 static int
@@ -3053,6 +3055,8 @@ _unpack_task_exit_msg(task_exit_msg_t ** msg_ptr, Buf buffer)
 	safe_unpack32_array(&msg->task_id_list, &uint32_tmp, buffer);
 	if (msg->num_tasks != uint32_tmp)
 		goto unpack_error;
+	safe_unpack32(&msg->job_id, buffer);
+	safe_unpack32(&msg->step_id, buffer);
 	return SLURM_SUCCESS;
 
 unpack_error:
diff --git a/src/sattach/sattach.c b/src/sattach/sattach.c
index 988d5970413..fd7df545244 100644
--- a/src/sattach/sattach.c
+++ b/src/sattach/sattach.c
@@ -529,6 +529,12 @@ _exit_handler(message_thread_state_t *mts, slurm_msg_t *exit_msg)
 	int i;
 	int rc;
 
+	if ((msg->job_id != opt.jobid) || (msg->step_id != opt.stepid)) {
+		debug("Received MESSAGE_TASK_EXIT from wrong job: %u.%u",
+		      msg->job_id, msg->step_id);
+		return;
+	}
+
 	pthread_mutex_lock(&mts->lock);
 
 	for (i = 0; i < msg->num_tasks; i++) {
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index b0e2b440ba2..bee73df2df1 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -420,6 +420,8 @@ _send_exit_msg(slurmd_job_t *job, uint32_t *tid, int n, int status)
 	msg.task_id_list = tid;
 	msg.num_tasks    = n;
 	msg.return_code  = status;
+	msg.job_id       = job->jobid;
+	msg.step_id      = job->stepid;
 	slurm_msg_t_init(&resp);
 	resp.data        = &msg;
 	resp.msg_type    = MESSAGE_TASK_EXIT;
-- 
GitLab