From ee27ba8943183dffcb0fc8b1055014362f87d497 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Thu, 29 Nov 2012 16:54:17 -0800 Subject: [PATCH] BGQ - add option to tell bg_requeue_job the slurmctld is locked --- src/plugins/select/bluegene/bg_core.c | 11 +++++++---- src/plugins/select/bluegene/bg_core.h | 3 ++- src/plugins/select/bluegene/bg_job_run.c | 8 ++++---- src/plugins/select/bluegene/bg_record_functions.c | 4 ++-- src/plugins/select/bluegene/bg_status.c | 5 +++-- src/plugins/select/bluegene/bg_status.h | 3 ++- src/plugins/select/bluegene/bl_bgq/bridge_status.cc | 8 ++++---- src/plugins/select/bluegene/select_bluegene.c | 2 +- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c index a6e236dbdec..880bdf224de 100644 --- a/src/plugins/select/bluegene/bg_core.c +++ b/src/plugins/select/bluegene/bg_core.c @@ -321,7 +321,8 @@ extern bool block_mp_passthrough(bg_record_t *bg_record, int mp_bit) } /* block_state_mutex must be unlocked before calling this. */ -extern void bg_requeue_job(uint32_t job_id, bool wait_for_start) +extern void bg_requeue_job(uint32_t job_id, bool wait_for_start, + bool slurmctld_locked) { int rc; slurmctld_lock_t job_write_lock = { @@ -333,13 +334,15 @@ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start) if (wait_for_start) sleep(2); - lock_slurmctld(job_write_lock); + if (!slurmctld_locked) + lock_slurmctld(job_write_lock); if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, false))) { error("Couldn't requeue job %u, failing it: %s", job_id, slurm_strerror(rc)); job_fail(job_id); } - unlock_slurmctld(job_write_lock); + if (!slurmctld_locked) + unlock_slurmctld(job_write_lock); } /* if SLURM_ERROR you will need to fail the job with @@ -585,7 +588,7 @@ extern int free_block_list(uint32_t job_id, List track_list, slurm_mutex_unlock(&block_state_mutex); if (kill_job_list) { - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); list_destroy(kill_job_list); kill_job_list = NULL; } diff --git a/src/plugins/select/bluegene/bg_core.h b/src/plugins/select/bluegene/bg_core.h index 0c6c43b08e7..e61a7157f75 100644 --- a/src/plugins/select/bluegene/bg_core.h +++ b/src/plugins/select/bluegene/bg_core.h @@ -69,7 +69,8 @@ extern bool blocks_overlap(bg_record_t *rec_a, bg_record_t *rec_b); extern bool block_mp_passthrough(bg_record_t *bg_record, int mp_bit); -extern void bg_requeue_job(uint32_t job_id, bool wait_for_start); +extern void bg_requeue_job(uint32_t job_id, bool wait_for_start, + bool slurmctld_locked); /* sort a list of bg_records by size (node count) */ extern void sort_bg_record_inc_size(List records); diff --git a/src/plugins/select/bluegene/bg_job_run.c b/src/plugins/select/bluegene/bg_job_run.c index 58a0ee309be..ffc5f051180 100644 --- a/src/plugins/select/bluegene/bg_job_run.c +++ b/src/plugins/select/bluegene/bg_job_run.c @@ -108,7 +108,7 @@ static int _make_sure_block_still_exists(bg_action_t *bg_action_ptr, "job %u requeueing if possible.", bg_action_ptr->bg_block_id, bg_action_ptr->job_ptr->job_id); - bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1); + bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1, 0); } return 0; } @@ -209,7 +209,7 @@ static void _start_agent(bg_action_t *bg_action_ptr) slurm_mutex_unlock(&block_state_mutex); error("block %s not found in bg_lists->main", bg_action_ptr->bg_block_id); - bg_requeue_job(req_job_id, 1); + bg_requeue_job(req_job_id, 1, 0); return; } @@ -279,7 +279,7 @@ static void _start_agent(bg_action_t *bg_action_ptr) bg_record->modifying = 0; slurm_mutex_unlock(&block_state_mutex); - bg_requeue_job(req_job_id, 0); + bg_requeue_job(req_job_id, 0, 0); return; } @@ -304,7 +304,7 @@ static void _start_agent(bg_action_t *bg_action_ptr) } if (IS_JOB_CONFIGURING(bg_action_ptr->job_ptr)) - bg_requeue_job(req_job_id, 0); + bg_requeue_job(req_job_id, 0, 0); return; } diff --git a/src/plugins/select/bluegene/bg_record_functions.c b/src/plugins/select/bluegene/bg_record_functions.c index fac0a078aa0..34738a7eaa7 100644 --- a/src/plugins/select/bluegene/bg_record_functions.c +++ b/src/plugins/select/bluegene/bg_record_functions.c @@ -562,12 +562,12 @@ extern void requeue_and_error(bg_record_t *bg_record, char *reason) } if (bg_record->job_running > NO_JOB_RUNNING) - bg_requeue_job(bg_record->job_running, 0); + bg_requeue_job(bg_record->job_running, 0, 0); else if (bg_record->job_list) { ListIterator itr = list_iterator_create(bg_record->job_list); struct job_record *job_ptr; while ((job_ptr = list_next(itr))) - bg_requeue_job(job_ptr->job_id, 0); + bg_requeue_job(job_ptr->job_id, 0, 0); list_iterator_destroy(itr); } slurm_mutex_lock(&block_state_mutex); diff --git a/src/plugins/select/bluegene/bg_status.c b/src/plugins/select/bluegene/bg_status.c index 270bcac58df..8226572c77b 100644 --- a/src/plugins/select/bluegene/bg_status.c +++ b/src/plugins/select/bluegene/bg_status.c @@ -356,7 +356,8 @@ extern List bg_status_create_kill_job_list(void) return list_create(_destroy_kill_struct); } -extern void bg_status_process_kill_job_list(List kill_job_list) +extern void bg_status_process_kill_job_list(List kill_job_list, + bool slurmctld_locked) { kill_job_struct_t *freeit = NULL; @@ -366,7 +367,7 @@ extern void bg_status_process_kill_job_list(List kill_job_list) /* kill all the jobs from unexpectedly freed blocks */ while ((freeit = list_pop(kill_job_list))) { debug2("Trying to requeue job %u", freeit->jobid); - bg_requeue_job(freeit->jobid, 0); + bg_requeue_job(freeit->jobid, 0, slurmctld_locked); _destroy_kill_struct(freeit); } } diff --git a/src/plugins/select/bluegene/bg_status.h b/src/plugins/select/bluegene/bg_status.h index 6920aa21986..715d512c95e 100644 --- a/src/plugins/select/bluegene/bg_status.h +++ b/src/plugins/select/bluegene/bg_status.h @@ -48,7 +48,8 @@ extern int bg_status_update_block_state(bg_record_t *bg_record, uint16_t state, List kill_job_list); extern List bg_status_create_kill_job_list(void); -extern void bg_status_process_kill_job_list(List kill_job_list); +extern void bg_status_process_kill_job_list(List kill_job_list, + bool slurmctld_locked); /* defined in the various bridge_status' */ extern int bridge_status_init(void); diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc index 50e4aa01d87..49dca3a9376 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc @@ -786,7 +786,7 @@ static void _do_block_poll(void) slurm_mutex_unlock(&block_state_mutex); unlock_slurmctld(job_read_lock); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); if (updated == 1) last_bg_update = time(NULL); @@ -967,7 +967,7 @@ static void _do_hardware_poll(int level, uint16_t *coords, if ((ba_mp = coord2ba_mp(coords))) _handle_midplane_update(bgqsys, ba_mp, &delete_list); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); if (delete_list) { bool delete_it = 0; @@ -1105,7 +1105,7 @@ void event_handler::handleBlockStateChangedRealtimeEvent( slurm_mutex_unlock(&block_state_mutex); unlock_slurmctld(job_read_lock); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); last_bg_update = time(NULL); } @@ -1337,7 +1337,7 @@ void event_handler::handleNodeStateChangedRealtimeEvent( slurm_mutex_unlock(&block_state_mutex); unlock_slurmctld(job_read_lock); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); if (delete_list) { /* The only reason blocks are added to this list is if diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index 8e994a94c39..dec84ad2f0f 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -2435,7 +2435,7 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) if (kill_job_list) { slurm_mutex_unlock(&block_state_mutex); - bg_status_process_kill_job_list(kill_job_list); + bg_status_process_kill_job_list(kill_job_list, 0); list_destroy(kill_job_list); kill_job_list = NULL; slurm_mutex_lock(&block_state_mutex); -- GitLab