diff --git a/src/plugins/select/bluegene/bg_status.c b/src/plugins/select/bluegene/bg_status.c index d572c56faf12d60702bf2cb54dc2f37d8cab5f2e..46e022eabddd7a3c6aad2476ef5a3ecad13a5423 100644 --- a/src/plugins/select/bluegene/bg_status.c +++ b/src/plugins/select/bluegene/bg_status.c @@ -88,7 +88,6 @@ static int _block_is_deallocating(bg_record_t *bg_record, List kill_job_list) } else if (bg_record->job_list && list_count(bg_record->job_list)) { struct job_record *job_ptr; -// lock_slurmctld(job_read_lock); while ((job_ptr = list_pop(bg_record->job_list))) { select_jobinfo_t *jobinfo; @@ -110,7 +109,6 @@ static int _block_is_deallocating(bg_record_t *bg_record, List kill_job_list) jobinfo->user_name, job_ptr->job_id); } -// unlock_slurmctld(job_read_lock); } else { debug("Block %s was in a ready state " "but is being freed. No job running.", @@ -262,7 +260,6 @@ nochange_state: ListIterator job_itr = list_iterator_create( bg_record->job_list); -// lock_slurmctld(job_read_lock); while ((job_ptr = list_next(job_itr))) { if (job_ptr->magic != JOB_MAGIC) { list_delete_item(job_itr); @@ -270,7 +267,6 @@ nochange_state: } job_ptr->job_state |= JOB_CONFIGURING; } -// unlock_slurmctld(job_read_lock); list_iterator_destroy(job_itr); last_job_update = time(NULL); } @@ -317,7 +313,6 @@ nochange_state: ListIterator job_itr = list_iterator_create( bg_record->job_list); -// lock_slurmctld(job_read_lock); while ((job_ptr = list_next(job_itr))) { if (job_ptr->magic != JOB_MAGIC) { list_delete_item(job_itr); @@ -326,7 +321,6 @@ nochange_state: job_ptr->job_state &= (~JOB_CONFIGURING); } -// unlock_slurmctld(job_read_lock); list_iterator_destroy(job_itr); last_job_update = time(NULL); } diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc index a36b22e742ad327cd9f8b12ccf159fa5f82fb2c3..d988ffdcbaa947c33f1250e3f715be83bf07f40f 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc @@ -608,6 +608,9 @@ static void _do_block_poll(void) if (!bg_lists->main) return; + /* Always lock the slurmctld before locking the + * block_state_mutex to avoid deadlock. */ + lock_slurmctld(job_read_lock); slurm_mutex_lock(&block_state_mutex); itr = list_iterator_create(bg_lists->main); while ((bg_record = (bg_record_t *) list_next(itr))) { @@ -637,6 +640,7 @@ static void _do_block_poll(void) updated = 1; } slurm_mutex_unlock(&block_state_mutex); + unlock_slurmctld(job_read_lock); bg_status_process_kill_job_list(kill_job_list); @@ -810,9 +814,13 @@ void event_handler::handleBlockStateChangedRealtimeEvent( if (!bg_lists->main) return; + /* Always lock the slurmctld before locking the + * block_state_mutex to avoid deadlock. */ + lock_slurmctld(job_read_lock); slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, bg_block_id); if (!bg_record) { + unlock_slurmctld(job_read_lock); slurm_mutex_unlock(&block_state_mutex); info("bridge_status: bg_record %s isn't in the main list", bg_block_id); @@ -824,6 +832,7 @@ void event_handler::handleBlockStateChangedRealtimeEvent( kill_job_list); slurm_mutex_unlock(&block_state_mutex); + unlock_slurmctld(job_read_lock); bg_status_process_kill_job_list(kill_job_list);