diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c index a6e236dbdecf108ee3ae3c34cff4ae93c436fcc9..880bdf224de575030268dc2ade21e3604c7253d8 100644 --- a/src/plugins/select/bluegene/bg_core.c +++ b/src/plugins/select/bluegene/bg_core.c @@ -321,7 +321,8 @@ extern bool block_mp_passthrough(bg_record_t *bg_record, int mp_bit) } /* block_state_mutex must be unlocked before calling this. */ -extern void bg_requeue_job(uint32_t job_id, bool wait_for_start) +extern void bg_requeue_job(uint32_t job_id, bool wait_for_start, + bool slurmctld_locked) { int rc; slurmctld_lock_t job_write_lock = { @@ -333,13 +334,15 @@ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start) if (wait_for_start) sleep(2); - lock_slurmctld(job_write_lock); + if (!slurmctld_locked) + lock_slurmctld(job_write_lock); if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, false))) { error("Couldn't requeue job %u, failing it: %s", job_id, slurm_strerror(rc)); job_fail(job_id); } - unlock_slurmctld(job_write_lock); + if (!slurmctld_locked) + unlock_slurmctld(job_write_lock); } /* if SLURM_ERROR you will need to fail the job with @@ -585,7 +588,7 @@ extern int free_block_list(uint32_t job_id, List track_list, slurm_mutex_unlock(&block_state_mutex); if (kill_job_list) { - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); list_destroy(kill_job_list); kill_job_list = NULL; } diff --git a/src/plugins/select/bluegene/bg_core.h b/src/plugins/select/bluegene/bg_core.h index 0c6c43b08e7adaabd933a9e8bf7b006859756cbf..e61a7157f75b88140e904d3de6292ae5c1c75916 100644 --- a/src/plugins/select/bluegene/bg_core.h +++ b/src/plugins/select/bluegene/bg_core.h @@ -69,7 +69,8 @@ extern bool blocks_overlap(bg_record_t *rec_a, bg_record_t *rec_b); extern bool block_mp_passthrough(bg_record_t *bg_record, int mp_bit); -extern void bg_requeue_job(uint32_t job_id, bool wait_for_start); +extern void bg_requeue_job(uint32_t job_id, bool wait_for_start, + bool slurmctld_locked); /* sort a list of bg_records by size (node count) */ extern void sort_bg_record_inc_size(List records); diff --git a/src/plugins/select/bluegene/bg_job_run.c b/src/plugins/select/bluegene/bg_job_run.c index 58a0ee309be6cedef671fd9bb697137ce7896910..ffc5f051180bbcab9474649c8ec2a6331bbdc2a0 100644 --- a/src/plugins/select/bluegene/bg_job_run.c +++ b/src/plugins/select/bluegene/bg_job_run.c @@ -108,7 +108,7 @@ static int _make_sure_block_still_exists(bg_action_t *bg_action_ptr, "job %u requeueing if possible.", bg_action_ptr->bg_block_id, bg_action_ptr->job_ptr->job_id); - bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1); + bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1, 0); } return 0; } @@ -209,7 +209,7 @@ static void _start_agent(bg_action_t *bg_action_ptr) slurm_mutex_unlock(&block_state_mutex); error("block %s not found in bg_lists->main", bg_action_ptr->bg_block_id); - bg_requeue_job(req_job_id, 1); + bg_requeue_job(req_job_id, 1, 0); return; } @@ -279,7 +279,7 @@ static void _start_agent(bg_action_t *bg_action_ptr) bg_record->modifying = 0; slurm_mutex_unlock(&block_state_mutex); - bg_requeue_job(req_job_id, 0); + bg_requeue_job(req_job_id, 0, 0); return; } @@ -304,7 +304,7 @@ static void _start_agent(bg_action_t *bg_action_ptr) } if (IS_JOB_CONFIGURING(bg_action_ptr->job_ptr)) - bg_requeue_job(req_job_id, 0); + bg_requeue_job(req_job_id, 0, 0); return; } diff --git a/src/plugins/select/bluegene/bg_record_functions.c b/src/plugins/select/bluegene/bg_record_functions.c index fac0a078aa032790c066a8aed5d29ad80e6a0940..34738a7eaa771165a47dc9011b7c28ddadeaf57f 100644 --- a/src/plugins/select/bluegene/bg_record_functions.c +++ b/src/plugins/select/bluegene/bg_record_functions.c @@ -562,12 +562,12 @@ extern void requeue_and_error(bg_record_t *bg_record, char *reason) } if (bg_record->job_running > NO_JOB_RUNNING) - bg_requeue_job(bg_record->job_running, 0); + bg_requeue_job(bg_record->job_running, 0, 0); else if (bg_record->job_list) { ListIterator itr = list_iterator_create(bg_record->job_list); struct job_record *job_ptr; while ((job_ptr = list_next(itr))) - bg_requeue_job(job_ptr->job_id, 0); + bg_requeue_job(job_ptr->job_id, 0, 0); list_iterator_destroy(itr); } slurm_mutex_lock(&block_state_mutex); diff --git a/src/plugins/select/bluegene/bg_status.c b/src/plugins/select/bluegene/bg_status.c index 270bcac58df99682bbcc11e5c0ae74bb1967efa9..8226572c77b230e2ea8aebf9112d14fcf2a621fb 100644 --- a/src/plugins/select/bluegene/bg_status.c +++ b/src/plugins/select/bluegene/bg_status.c @@ -356,7 +356,8 @@ extern List bg_status_create_kill_job_list(void) return list_create(_destroy_kill_struct); } -extern void bg_status_process_kill_job_list(List kill_job_list) +extern void bg_status_process_kill_job_list(List kill_job_list, + bool slurmctld_locked) { kill_job_struct_t *freeit = NULL; @@ -366,7 +367,7 @@ extern void bg_status_process_kill_job_list(List kill_job_list) /* kill all the jobs from unexpectedly freed blocks */ while ((freeit = list_pop(kill_job_list))) { debug2("Trying to requeue job %u", freeit->jobid); - bg_requeue_job(freeit->jobid, 0); + bg_requeue_job(freeit->jobid, 0, slurmctld_locked); _destroy_kill_struct(freeit); } } diff --git a/src/plugins/select/bluegene/bg_status.h b/src/plugins/select/bluegene/bg_status.h index 6920aa2198672d5413ca3843bc37fbc5fb84296b..715d512c95e559422219ae9ff5d79ab8e07a47de 100644 --- a/src/plugins/select/bluegene/bg_status.h +++ b/src/plugins/select/bluegene/bg_status.h @@ -48,7 +48,8 @@ extern int bg_status_update_block_state(bg_record_t *bg_record, uint16_t state, List kill_job_list); extern List bg_status_create_kill_job_list(void); -extern void bg_status_process_kill_job_list(List kill_job_list); +extern void bg_status_process_kill_job_list(List kill_job_list, + bool slurmctld_locked); /* defined in the various bridge_status' */ extern int bridge_status_init(void); diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc index 50e4aa01d873bfa93c0b392d37cbf2dfb847fa96..49dca3a9376a541553c44fdc284ef5345facdc61 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc @@ -786,7 +786,7 @@ static void _do_block_poll(void) slurm_mutex_unlock(&block_state_mutex); unlock_slurmctld(job_read_lock); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); if (updated == 1) last_bg_update = time(NULL); @@ -967,7 +967,7 @@ static void _do_hardware_poll(int level, uint16_t *coords, if ((ba_mp = coord2ba_mp(coords))) _handle_midplane_update(bgqsys, ba_mp, &delete_list); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); if (delete_list) { bool delete_it = 0; @@ -1105,7 +1105,7 @@ void event_handler::handleBlockStateChangedRealtimeEvent( slurm_mutex_unlock(&block_state_mutex); unlock_slurmctld(job_read_lock); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); last_bg_update = time(NULL); } @@ -1337,7 +1337,7 @@ void event_handler::handleNodeStateChangedRealtimeEvent( slurm_mutex_unlock(&block_state_mutex); unlock_slurmctld(job_read_lock); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); if (delete_list) { /* The only reason blocks are added to this list is if diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index 8e994a94c3928b6e49806c43fcba99ac9b846d2d..dec84ad2f0f03af8b68f9eef4f559021a4c75dc6 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -2435,7 +2435,7 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) if (kill_job_list) { slurm_mutex_unlock(&block_state_mutex); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); list_destroy(kill_job_list); kill_job_list = NULL; slurm_mutex_lock(&block_state_mutex);