diff --git a/NEWS b/NEWS index b48bab115bc76236d2b772b5a778703b0490855f..277696d0c475449c45eaa80bab0749992f7e37b0 100644 --- a/NEWS +++ b/NEWS @@ -84,6 +84,8 @@ documents those changes that are of interest to users and administrators. -- Do not select_g_step_finish() a SLURM_PENDING_STEP step, as nothing has been allocated for the step yet. -- Fixed race condition in PMIx Fence logic. + -- Prevent slurmctld abort if job is killed or requeued while waiting for + reboot of its allocated compute nodes. * Changes in Slurm 16.05.2 diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 1c2c29648b36c45116c8215cb6387cd863349848..8ba207f6311ccb73a56f7304a17da38ca28ccf4c 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -3656,6 +3656,14 @@ static void *_wait_boot(void *arg) unlock_slurmctld(job_write_lock); return NULL; } + if (IS_JOB_PENDING(job_ptr) || /* Job requeued or killed */ + IS_JOB_FINISHED(job_ptr) || + !job_ptr->node_bitmap) { + verbose("Job %u no longer waiting for node boot", + save_job_id); + unlock_slurmctld(job_write_lock); + return NULL; + } for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count; i++, node_ptr++) { if (!bit_test(job_ptr->node_bitmap, i)) diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 8855d52f0435c307322f966e97ea115a76b38e52..9200025ecbe078dfa8c4326df02da91efc3a3b85 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -2492,8 +2492,7 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg, node_ptr->reason = xstrdup( "Node unexpectedly rebooted"); } - info("%s: Node %s unexpectedly rebooted boot_time=%u " - "last response=%u", + info("%s: Node %s unexpectedly rebooted boot_time=%u last response=%u", __func__, reg_msg->node_name, (uint32_t)node_ptr->boot_time, (uint32_t)node_ptr->last_response); diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 07e7382fdf302ac21bce66cfefa3274141a53ed9..615c7f848b662907656bcf616afab9b2a2880aac 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -2589,7 +2589,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, } /* Request asynchronous launch of a prolog for a - * non batch job. */ + * non-batch job. */ if ((slurmctld_conf.prolog_flags & PROLOG_FLAG_ALLOC) || (slurmctld_conf.prolog_flags & PROLOG_FLAG_CONTAIN)) _launch_prolog(job_ptr);