diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index a64311995a118534f06dfed3829ecb23960fb0ab..10fb5736e1e93b8f45bd9889b053af6c2c2002ea 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -3676,13 +3676,6 @@ static bitstr_t *_valid_features(struct job_record *job_ptr, return result_bits; } -static int _kill_step(struct step_record *step_ptr, void *arg) -{ - select_g_step_finish(step_ptr, true); - - return SLURM_SUCCESS; -} - /* * re_kill_job - for a given job, deallocate its nodes for a second time, * basically a cleanup for failed deallocate() calls @@ -3699,6 +3692,7 @@ extern void re_kill_job(struct job_record *job_ptr) char *host_str = NULL; static uint32_t last_job_id = 0; struct node_record *node_ptr; + ListIterator step_iterator; #ifdef HAVE_FRONT_END front_end_record_t *front_end_ptr; #endif @@ -3822,8 +3816,18 @@ extern void re_kill_job(struct job_record *job_ptr) /* On a Cray system this will start the NHC early so it is * able to gather any information it can from the apparent * unkillable processes. + * NOTE: do not do a list_for_each here, that will hold on the list + * lock while processing the entire list which could + * potentially be needed to lock again in + * select_g_step_finish which could potentially call + * post_job_step which calls delete_step_record which locks + * the list to create a list_iterator on the same list and + * could cause deadlock :). */ - list_for_each(job_ptr->step_list, (ListForF)_kill_step, NULL); + step_iterator = list_iterator_create(job_ptr->step_list); + while ((step_ptr = list_next(step_iterator))) + select_g_step_finish(step_ptr, true); + list_iterator_destroy(step_iterator); xfree(host_str); last_job_id = job_ptr->job_id; diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 65a14b7fb124cf93d98074442bfce8661a4405e3..c747ba1829dfeae172a2625c03dd53402227be78 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -4531,6 +4531,10 @@ extern void rebuild_step_bitmaps(struct job_record *job_ptr, list_iterator_destroy (step_iterator); } +/* NOTE: this function will call delete_step_record which will lock + * the job_ptr->step_list so make sure you don't call this if you are + * already holding a lock on that list (list_for_each). + */ extern int post_job_step(struct step_record *step_ptr) { struct job_record *job_ptr = step_ptr->job_ptr;