diff --git a/NEWS b/NEWS index cf84d3cdb15f60317cd198167f7913e11235c20b..2ed32e45bf0b57c94eef6022e4e731fd62ef4882 100644 --- a/NEWS +++ b/NEWS @@ -3,7 +3,13 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.3.7 ======================== - + -- Add jobid/stepid to MESSAGE_TASK_EXIT to address race condition when + a job step is cancelled, another is started immediately (before the + first one completely terminates) and ports are reused. + NOTE: This change requires that SLURM be updated on all nodes of the + cluster at the same time. There will be no impact upon currently running + jobs (they will ignore the jobid/stepid at the end of the message). + * Changes in SLURM 1.3.6 ======================== -- Add new function to get information for a single job rather than always diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index c48aa5d11ffd70518563ec4b66f8489d603fe411..622a3f9c67e49d270ac78f70698996846dcd8867 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -705,6 +705,8 @@ typedef struct task_ext_msg { uint32_t num_tasks; uint32_t *task_id_list; uint32_t return_code; + uint32_t job_id; + uint32_t step_id; } task_exit_msg_t; typedef struct srun_ping_msg { diff --git a/src/api/step_launch.c b/src/api/step_launch.c index 981fb2da6513528c52008b16702b55960eb20f3b..566beba232c6a03d03ab69b238d4c69c54b18434 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -767,6 +767,13 @@ _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg) task_exit_msg_t *msg = (task_exit_msg_t *) exit_msg->data; int i; + if ((msg->job_id != sls->mpi_info->jobid) || + (msg->step_id != sls->mpi_info->stepid)) { + debug("Received MESSAGE_TASK_EXIT from wrong job: %u.%u", + msg->job_id, msg->step_id); + return; + } + /* Record SIGTERM and SIGKILL termination codes to * recognize abnormal termination */ if (WIFSIGNALED(msg->return_code)) { diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 9be1aabca44cee3585049fb9847c7966b209ee7d..deb12c3843ee8b806b62747b9049d405ff276c63 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -3032,10 +3032,12 @@ static void _pack_task_exit_msg(task_exit_msg_t * msg, Buf buffer) { xassert(msg != NULL); - pack32((uint32_t)msg->return_code, buffer); - pack32((uint32_t)msg->num_tasks, buffer); + pack32(msg->return_code, buffer); + pack32(msg->num_tasks, buffer); pack32_array(msg->task_id_list, msg->num_tasks, buffer); + pack32(msg->job_id, buffer); + pack32(msg->step_id, buffer); } static int @@ -3053,6 +3055,8 @@ _unpack_task_exit_msg(task_exit_msg_t ** msg_ptr, Buf buffer) safe_unpack32_array(&msg->task_id_list, &uint32_tmp, buffer); if (msg->num_tasks != uint32_tmp) goto unpack_error; + safe_unpack32(&msg->job_id, buffer); + safe_unpack32(&msg->step_id, buffer); return SLURM_SUCCESS; unpack_error: diff --git a/src/sattach/sattach.c b/src/sattach/sattach.c index 988d5970413225dede89d6d68bb365435349dc96..fd7df5452444d5182b281121cfe1cb4921baf843 100644 --- a/src/sattach/sattach.c +++ b/src/sattach/sattach.c @@ -529,6 +529,12 @@ _exit_handler(message_thread_state_t *mts, slurm_msg_t *exit_msg) int i; int rc; + if ((msg->job_id != opt.jobid) || (msg->step_id != opt.stepid)) { + debug("Received MESSAGE_TASK_EXIT from wrong job: %u.%u", + msg->job_id, msg->step_id); + return; + } + pthread_mutex_lock(&mts->lock); for (i = 0; i < msg->num_tasks; i++) { diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index b0e2b440ba28afff8d5c75df7acc58706d98b6f9..bee73df2df1874541bf4f06c5fa87a0aa4979b8d 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -420,6 +420,8 @@ _send_exit_msg(slurmd_job_t *job, uint32_t *tid, int n, int status) msg.task_id_list = tid; msg.num_tasks = n; msg.return_code = status; + msg.job_id = job->jobid; + msg.step_id = job->stepid; slurm_msg_t_init(&resp); resp.data = &msg; resp.msg_type = MESSAGE_TASK_EXIT;