From ee27ba8943183dffcb0fc8b1055014362f87d497 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Thu, 29 Nov 2012 16:54:17 -0800
Subject: [PATCH] BGQ - add option to tell bg_requeue_job that slurmctld is
 already locked

---
 src/plugins/select/bluegene/bg_core.c               | 11 +++++++----
 src/plugins/select/bluegene/bg_core.h               |  3 ++-
 src/plugins/select/bluegene/bg_job_run.c            |  8 ++++----
 src/plugins/select/bluegene/bg_record_functions.c   |  4 ++--
 src/plugins/select/bluegene/bg_status.c             |  5 +++--
 src/plugins/select/bluegene/bg_status.h             |  3 ++-
 src/plugins/select/bluegene/bl_bgq/bridge_status.cc |  8 ++++----
 src/plugins/select/bluegene/select_bluegene.c       |  2 +-
 8 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c
index a6e236dbdec..880bdf224de 100644
--- a/src/plugins/select/bluegene/bg_core.c
+++ b/src/plugins/select/bluegene/bg_core.c
@@ -321,7 +321,8 @@ extern bool block_mp_passthrough(bg_record_t *bg_record, int mp_bit)
 }
 
 /* block_state_mutex must be unlocked before calling this. */
-extern void bg_requeue_job(uint32_t job_id, bool wait_for_start)
+extern void bg_requeue_job(uint32_t job_id, bool wait_for_start,
+			   bool slurmctld_locked)
 {
 	int rc;
 	slurmctld_lock_t job_write_lock = {
@@ -333,13 +334,15 @@ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start)
 	if (wait_for_start)
 		sleep(2);
 
-	lock_slurmctld(job_write_lock);
+	if (!slurmctld_locked)
+		lock_slurmctld(job_write_lock);
 	if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, false))) {
 		error("Couldn't requeue job %u, failing it: %s",
 		      job_id, slurm_strerror(rc));
 		job_fail(job_id);
 	}
-	unlock_slurmctld(job_write_lock);
+	if (!slurmctld_locked)
+		unlock_slurmctld(job_write_lock);
 }
 
 /* if SLURM_ERROR you will need to fail the job with
@@ -585,7 +588,7 @@ extern int free_block_list(uint32_t job_id, List track_list,
 	slurm_mutex_unlock(&block_state_mutex);
 
 	if (kill_job_list) {
-		bg_status_process_kill_job_list(kill_job_list);
+		bg_status_process_kill_job_list(kill_job_list, 0);
 		list_destroy(kill_job_list);
 		kill_job_list = NULL;
 	}
diff --git a/src/plugins/select/bluegene/bg_core.h b/src/plugins/select/bluegene/bg_core.h
index 0c6c43b08e7..e61a7157f75 100644
--- a/src/plugins/select/bluegene/bg_core.h
+++ b/src/plugins/select/bluegene/bg_core.h
@@ -69,7 +69,8 @@
 
 extern bool blocks_overlap(bg_record_t *rec_a, bg_record_t *rec_b);
 extern bool block_mp_passthrough(bg_record_t *bg_record, int mp_bit);
-extern void bg_requeue_job(uint32_t job_id, bool wait_for_start);
+extern void bg_requeue_job(uint32_t job_id, bool wait_for_start,
+			   bool slurmctld_locked);
 
 /* sort a list of bg_records by size (node count) */
 extern void sort_bg_record_inc_size(List records);
diff --git a/src/plugins/select/bluegene/bg_job_run.c b/src/plugins/select/bluegene/bg_job_run.c
index 58a0ee309be..ffc5f051180 100644
--- a/src/plugins/select/bluegene/bg_job_run.c
+++ b/src/plugins/select/bluegene/bg_job_run.c
@@ -108,7 +108,7 @@ static int _make_sure_block_still_exists(bg_action_t *bg_action_ptr,
 			      "job %u requeueing if possible.",
 			      bg_action_ptr->bg_block_id,
 			      bg_action_ptr->job_ptr->job_id);
-			bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1);
+			bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1, 0);
 		}
 		return 0;
 	}
@@ -209,7 +209,7 @@ static void _start_agent(bg_action_t *bg_action_ptr)
 		slurm_mutex_unlock(&block_state_mutex);
 		error("block %s not found in bg_lists->main",
 		      bg_action_ptr->bg_block_id);
-		bg_requeue_job(req_job_id, 1);
+		bg_requeue_job(req_job_id, 1, 0);
 		return;
 	}
 
@@ -279,7 +279,7 @@ static void _start_agent(bg_action_t *bg_action_ptr)
 
 		bg_record->modifying = 0;
 		slurm_mutex_unlock(&block_state_mutex);
-		bg_requeue_job(req_job_id, 0);
+		bg_requeue_job(req_job_id, 0, 0);
 		return;
 	}
 
@@ -304,7 +304,7 @@ static void _start_agent(bg_action_t *bg_action_ptr)
 		}
 
 		if (IS_JOB_CONFIGURING(bg_action_ptr->job_ptr))
-			bg_requeue_job(req_job_id, 0);
+			bg_requeue_job(req_job_id, 0, 0);
 		return;
 	}
 
diff --git a/src/plugins/select/bluegene/bg_record_functions.c b/src/plugins/select/bluegene/bg_record_functions.c
index fac0a078aa0..34738a7eaa7 100644
--- a/src/plugins/select/bluegene/bg_record_functions.c
+++ b/src/plugins/select/bluegene/bg_record_functions.c
@@ -562,12 +562,12 @@ extern void requeue_and_error(bg_record_t *bg_record, char *reason)
 	}
 
 	if (bg_record->job_running > NO_JOB_RUNNING)
-		bg_requeue_job(bg_record->job_running, 0);
+		bg_requeue_job(bg_record->job_running, 0, 0);
 	else if (bg_record->job_list) {
 		ListIterator itr = list_iterator_create(bg_record->job_list);
 		struct job_record *job_ptr;
 		while ((job_ptr = list_next(itr)))
-			bg_requeue_job(job_ptr->job_id, 0);
+			bg_requeue_job(job_ptr->job_id, 0, 0);
 		list_iterator_destroy(itr);
 	}
 	slurm_mutex_lock(&block_state_mutex);
diff --git a/src/plugins/select/bluegene/bg_status.c b/src/plugins/select/bluegene/bg_status.c
index 270bcac58df..8226572c77b 100644
--- a/src/plugins/select/bluegene/bg_status.c
+++ b/src/plugins/select/bluegene/bg_status.c
@@ -356,7 +356,8 @@ extern List bg_status_create_kill_job_list(void)
 	return list_create(_destroy_kill_struct);
 }
 
-extern void bg_status_process_kill_job_list(List kill_job_list)
+extern void bg_status_process_kill_job_list(List kill_job_list,
+					    bool slurmctld_locked)
 {
 	kill_job_struct_t *freeit = NULL;
 
@@ -366,7 +367,7 @@ extern void bg_status_process_kill_job_list(List kill_job_list)
 	/* kill all the jobs from unexpectedly freed blocks */
 	while ((freeit = list_pop(kill_job_list))) {
 		debug2("Trying to requeue job %u", freeit->jobid);
-		bg_requeue_job(freeit->jobid, 0);
+		bg_requeue_job(freeit->jobid, 0, slurmctld_locked);
 		_destroy_kill_struct(freeit);
 	}
 }
diff --git a/src/plugins/select/bluegene/bg_status.h b/src/plugins/select/bluegene/bg_status.h
index 6920aa21986..715d512c95e 100644
--- a/src/plugins/select/bluegene/bg_status.h
+++ b/src/plugins/select/bluegene/bg_status.h
@@ -48,7 +48,8 @@ extern int bg_status_update_block_state(bg_record_t *bg_record,
 					uint16_t state,
 					List kill_job_list);
 extern List bg_status_create_kill_job_list(void);
-extern void bg_status_process_kill_job_list(List kill_job_list);
+extern void bg_status_process_kill_job_list(List kill_job_list,
+					    bool slurmctld_locked);
 
 /* defined in the various bridge_status' */
 extern int bridge_status_init(void);
diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
index 50e4aa01d87..49dca3a9376 100644
--- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
+++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
@@ -786,7 +786,7 @@ static void _do_block_poll(void)
 	slurm_mutex_unlock(&block_state_mutex);
 	unlock_slurmctld(job_read_lock);
 
-	bg_status_process_kill_job_list(kill_job_list);
+	bg_status_process_kill_job_list(kill_job_list, 0);
 
 	if (updated == 1)
 		last_bg_update = time(NULL);
@@ -967,7 +967,7 @@ static void _do_hardware_poll(int level, uint16_t *coords,
 	if ((ba_mp = coord2ba_mp(coords)))
 		_handle_midplane_update(bgqsys, ba_mp, &delete_list);
 
-	bg_status_process_kill_job_list(kill_job_list);
+	bg_status_process_kill_job_list(kill_job_list, 0);
 
 	if (delete_list) {
 		bool delete_it = 0;
@@ -1105,7 +1105,7 @@ void event_handler::handleBlockStateChangedRealtimeEvent(
 	slurm_mutex_unlock(&block_state_mutex);
 	unlock_slurmctld(job_read_lock);
 
-	bg_status_process_kill_job_list(kill_job_list);
+	bg_status_process_kill_job_list(kill_job_list, 0);
 
 	last_bg_update = time(NULL);
 }
@@ -1337,7 +1337,7 @@ void event_handler::handleNodeStateChangedRealtimeEvent(
 	slurm_mutex_unlock(&block_state_mutex);
 	unlock_slurmctld(job_read_lock);
 
-	bg_status_process_kill_job_list(kill_job_list);
+	bg_status_process_kill_job_list(kill_job_list, 0);
 
 	if (delete_list) {
 		/* The only reason blocks are added to this list is if
diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c
index 8e994a94c39..dec84ad2f0f 100644
--- a/src/plugins/select/bluegene/select_bluegene.c
+++ b/src/plugins/select/bluegene/select_bluegene.c
@@ -2435,7 +2435,7 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr)
 
 	if (kill_job_list) {
 		slurm_mutex_unlock(&block_state_mutex);
-		bg_status_process_kill_job_list(kill_job_list);
+		bg_status_process_kill_job_list(kill_job_list, 0);
 		list_destroy(kill_job_list);
 		kill_job_list = NULL;
 		slurm_mutex_lock(&block_state_mutex);
-- 
GitLab