From 06074ffbeda43fbcdcd06ce62d414046a3b3356b Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Thu, 8 Mar 2012 12:14:39 -0800
Subject: [PATCH] BLUEGENE - fixed potential deadlock issue when a nodeboard
 goes down and people are polling the system at the exact same time.

---
 .../select/bluegene/bg_record_functions.c     | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/plugins/select/bluegene/bg_record_functions.c b/src/plugins/select/bluegene/bg_record_functions.c
index 81f994e8841..9ed5fd28f78 100644
--- a/src/plugins/select/bluegene/bg_record_functions.c
+++ b/src/plugins/select/bluegene/bg_record_functions.c
@@ -942,6 +942,8 @@ extern int down_nodecard(char *mp_name, bitoff_t io_start,
 	static int create_size = NO_VAL;
 	static select_ba_request_t blockreq;
 	int rc = SLURM_SUCCESS;
+	slurmctld_lock_t job_write_lock = {
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 
 	xassert(mp_name);
 
@@ -998,6 +1000,11 @@ extern int down_nodecard(char *mp_name, bitoff_t io_start,
 	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
 	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);
 
+	/* To avoid deadlock we always must lock the slurmctld before
+	   the block_state_mutex.
+	*/
+	if (!slurmctld_locked)
+		lock_slurmctld(job_write_lock);
 	slurm_mutex_lock(&block_state_mutex);
 	itr = list_iterator_create(bg_lists->main);
 	while ((bg_record = list_next(itr))) {
@@ -1008,19 +1015,13 @@ extern int down_nodecard(char *mp_name, bitoff_t io_start,
 			continue;
 
 		if (bg_record->job_running > NO_JOB_RUNNING) {
-			if (slurmctld_locked)
-				job_fail(bg_record->job_running);
-			else
-				slurm_fail_job(bg_record->job_running);
+			job_fail(bg_record->job_running);
 		} else if (bg_record->job_list) {
 			ListIterator job_itr = list_iterator_create(
 				bg_record->job_list);
 			struct job_record *job_ptr;
 			while ((job_ptr = list_next(job_itr))) {
-				if (slurmctld_locked)
-					job_fail(job_ptr->job_id);
-				else
-					slurm_fail_job(job_ptr->job_id);
+				job_fail(job_ptr->job_id);
 			}
 			list_iterator_destroy(job_itr);
 		}
@@ -1043,6 +1044,8 @@ extern int down_nodecard(char *mp_name, bitoff_t io_start,
 	}
 	list_iterator_destroy(itr);
 	slurm_mutex_unlock(&block_state_mutex);
+	if (!slurmctld_locked)
+		unlock_slurmctld(job_write_lock);
 
 	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
 		debug3("running non-dynamic mode");
-- 
GitLab