From 607adb30b97b4bcc64bcaca8a4e23aaf0f88b041 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Mon, 15 Jun 2009 22:16:40 +0000
Subject: [PATCH] better locking in bluegene

---
 .../select/bluegene/plugin/bg_block_info.c    |  2 +-
 .../select/bluegene/plugin/bg_job_run.c       | 53 ++++++++++---------
 .../bluegene/plugin/bg_record_functions.c     | 11 ++--
 .../select/bluegene/plugin/block_sys.c        |  2 +
 .../select/bluegene/plugin/select_bluegene.c  |  6 ++-
 5 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c
index cbbb6698fa5..ba43680507a 100644
--- a/src/plugins/select/bluegene/plugin/bg_block_info.c
+++ b/src/plugins/select/bluegene/plugin/bg_block_info.c
@@ -176,8 +176,8 @@ extern int block_ready(struct job_record *job_ptr)
 	rc = select_g_select_jobinfo_get(job_ptr->select_jobinfo,
 				  SELECT_JOBDATA_BLOCK_ID, &block_id);
 	if (rc == SLURM_SUCCESS) {
-		bg_record = find_bg_record_in_list(bg_lists->main, block_id);
 		slurm_mutex_lock(&block_state_mutex);
+		bg_record = find_bg_record_in_list(bg_lists->main, block_id);
 		
 		if(bg_record) {
 			if(bg_record->job_running != job_ptr->job_id) {
diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c
index ab69304e59f..29c4951a533 100644
--- a/src/plugins/select/bluegene/plugin/bg_job_run.c
+++ b/src/plugins/select/bluegene/plugin/bg_job_run.c
@@ -297,13 +297,14 @@ static void _sync_agent(bg_update_t *bg_update_ptr)
 {
 	bg_record_t * bg_record = NULL;
 	
+	slurm_mutex_lock(&block_state_mutex);
 	bg_record = find_bg_record_in_list(bg_lists->main,
 					   bg_update_ptr->bg_block_id);
 	if(!bg_record) {
+		slurm_mutex_unlock(&block_state_mutex);
 		error("No block %s", bg_update_ptr->bg_block_id);
 		return;
 	}
-	slurm_mutex_lock(&block_state_mutex);
 
 	last_bg_update = time(NULL);
 	bg_record->job_running = bg_update_ptr->job_ptr->job_id;
@@ -315,13 +316,11 @@ static void _sync_agent(bg_update_t *bg_update_ptr)
 	}
 	if(!block_ptr_exist_in_list(bg_lists->booted, bg_record)) 
 		list_push(bg_lists->booted, bg_record);
-	slurm_mutex_unlock(&block_state_mutex);
 
 	if(bg_record->state == RM_PARTITION_READY) {
 		if(bg_record->user_uid != bg_update_ptr->job_ptr->user_id) {
 			int set_user_rc = SLURM_SUCCESS;
 
-			slurm_mutex_lock(&block_state_mutex);
 			debug("User isn't correct for job %d on %s, "
 			      "fixing...", 
 			      bg_update_ptr->job_ptr->job_id,
@@ -334,7 +333,9 @@ static void _sync_agent(bg_update_t *bg_update_ptr)
 		
 			if(set_user_rc == SLURM_ERROR) 
 				(void) slurm_fail_job(bg_record->job_running);
-		}
+		} else
+			slurm_mutex_unlock(&block_state_mutex);
+
 	} else {
 		if(bg_record->state != RM_PARTITION_CONFIGURING) {
 			error("Block %s isn't ready and isn't "
@@ -344,6 +345,7 @@ static void _sync_agent(bg_update_t *bg_update_ptr)
 			debug("Block %s is booting, job ok",
 			      bg_update_ptr->bg_block_id);
 		}
+		slurm_mutex_unlock(&block_state_mutex);
 		_start_agent(bg_update_ptr);
 	}
 }
@@ -362,10 +364,12 @@ static void _start_agent(bg_update_t *bg_update_ptr)
 
 	slurm_mutex_lock(&job_start_mutex);
 		
+	slurm_mutex_lock(&block_state_mutex);
 	bg_record = find_bg_record_in_list(bg_lists->main, 
 					   bg_update_ptr->bg_block_id);
 
 	if(!bg_record) {
+		slurm_mutex_unlock(&block_state_mutex);
 		error("block %s not found in bg_lists->main",
 		      bg_update_ptr->bg_block_id);
 		/* wait for the slurmd to begin 
@@ -385,7 +389,7 @@ static void _start_agent(bg_update_t *bg_update_ptr)
 		slurm_mutex_unlock(&job_start_mutex);
 		return;
 	}
-	slurm_mutex_lock(&block_state_mutex);
+
 	if(bg_record->job_running <= NO_JOB_RUNNING) {
 		// _reset_block(bg_record); should already happened
 		slurm_mutex_unlock(&block_state_mutex);
@@ -816,10 +820,10 @@ static void _term_agent(bg_update_t *bg_update_ptr)
 #endif
 	
 	/* remove the block's users */
+	slurm_mutex_lock(&block_state_mutex);
 	bg_record = find_bg_record_in_list(bg_lists->main,
 					   bg_update_ptr->bg_block_id);
 	if(bg_record) {
-		slurm_mutex_lock(&block_state_mutex);
 		debug("got the record %s user is %s",
 		      bg_record->bg_block_id,
 		      bg_record->user_name);
@@ -838,10 +842,7 @@ static void _term_agent(bg_update_t *bg_update_ptr)
 				      bg_update_ptr->bg_block_id);
 		}
 
-		_reset_block(bg_record);
-		
-		slurm_mutex_unlock(&block_state_mutex);
-		
+		_reset_block(bg_record);		
 	} else if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
 		debug2("Hopefully we are destroying this block %s "
 		       "since it isn't in the bg_lists->main",
@@ -854,6 +855,7 @@ static void _term_agent(bg_update_t *bg_update_ptr)
 		      "happen outside of that.",
 		      bg_update_ptr->bg_block_id);
 	}
+	slurm_mutex_unlock(&block_state_mutex);
 
 #ifdef HAVE_BG_FILES
 	if ((rc = bridge_free_job_list(job_list)) != STATUS_OK)
@@ -1121,29 +1123,32 @@ extern int start_job(struct job_record *job_ptr)
 				     SELECT_JOBDATA_RAMDISK_IMAGE, 
 				     bg_update_ptr->ramdiskimage);
 	}
+
+	slurm_mutex_lock(&block_state_mutex);
 	bg_record = 
 		find_bg_record_in_list(bg_lists->main, 
 				       bg_update_ptr->bg_block_id);
-	if (bg_record) {
-		slurm_mutex_lock(&block_state_mutex);
-		last_bg_update = time(NULL);
-		job_ptr->num_procs = bg_record->cpu_cnt;
-		job_ptr->total_procs = job_ptr->num_procs;
-		bg_record->job_running = bg_update_ptr->job_ptr->job_id;
-		bg_record->job_ptr = bg_update_ptr->job_ptr;
-		if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
-			list_push(bg_lists->job_running, bg_record);
-			num_unused_cpus -= bg_record->cpu_cnt;
-		}
-		if(!block_ptr_exist_in_list(bg_lists->booted, bg_record))
-			list_push(bg_lists->booted, bg_record);
+	if (!bg_record) {
 		slurm_mutex_unlock(&block_state_mutex);
-	} else {
 		error("bg_record %s doesn't exist, requested for job (%d)", 
 		      bg_update_ptr->bg_block_id, job_ptr->job_id);
 		_bg_list_del(bg_update_ptr);
 		return SLURM_ERROR;
 	}
+
+	last_bg_update = time(NULL);
+	job_ptr->num_procs = bg_record->cpu_cnt;
+	job_ptr->total_procs = job_ptr->num_procs;
+	bg_record->job_running = bg_update_ptr->job_ptr->job_id;
+	bg_record->job_ptr = bg_update_ptr->job_ptr;
+	if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
+		list_push(bg_lists->job_running, bg_record);
+		num_unused_cpus -= bg_record->cpu_cnt;
+	}
+	if(!block_ptr_exist_in_list(bg_lists->booted, bg_record))
+		list_push(bg_lists->booted, bg_record);
+	slurm_mutex_unlock(&block_state_mutex);
+
 	info("Queue start of job %u in BG block %s",
 	     job_ptr->job_id, 
 	     bg_update_ptr->bg_block_id);
diff --git a/src/plugins/select/bluegene/plugin/bg_record_functions.c b/src/plugins/select/bluegene/plugin/bg_record_functions.c
index 35be54c0fa4..cacb4dfd0f7 100644
--- a/src/plugins/select/bluegene/plugin/bg_record_functions.c
+++ b/src/plugins/select/bluegene/plugin/bg_record_functions.c
@@ -494,6 +494,9 @@ extern int bg_record_cmpf_inc(bg_record_t* rec_a, bg_record_t* rec_b)
 	return 0;
 }
 
+/* if looking at the main list this should have some nice
+ * block_state_mutex locks around it.
+ */
 extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id)
 {
 	ListIterator itr;
@@ -504,16 +507,14 @@ extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id)
 	if(!bg_block_id)
 		return NULL;
 			
-	slurm_mutex_lock(&block_state_mutex);
 	itr = list_iterator_create(my_list);
-	while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
+	while((bg_record = list_next(itr))) {
 		if(bg_record->bg_block_id)
-			if (!strcmp(bg_record->bg_block_id, 
-				    bg_block_id))
+			if(!strcmp(bg_record->bg_block_id, bg_block_id))
 				break;
 	}
 	list_iterator_destroy(itr);
-	slurm_mutex_unlock(&block_state_mutex);
+
 	if(bg_record)
 		return bg_record;
 	else
diff --git a/src/plugins/select/bluegene/plugin/block_sys.c b/src/plugins/select/bluegene/plugin/block_sys.c
index d4d3b8172ad..edbbd3a6a78 100755
--- a/src/plugins/select/bluegene/plugin/block_sys.c
+++ b/src/plugins/select/bluegene/plugin/block_sys.c
@@ -1065,6 +1065,7 @@ extern int load_state_file(List curr_block_list, char *dir_name)
 		/* we only care about the states we need here
 		 * everthing else should have been set up already */
 		if(bg_info_record->state == RM_PARTITION_ERROR) {
+			slurm_mutex_lock(&block_state_mutex);
 			if((bg_record = find_bg_record_in_list(
 				    curr_block_list,
 				    bg_info_record->bg_block_id)))
@@ -1075,6 +1076,7 @@ extern int load_state_file(List curr_block_list, char *dir_name)
 				   around in bg_lists->main.
 				*/
 				bg_record->state = bg_info_record->state;
+			slurm_mutex_unlock(&block_state_mutex);
 		}
 	}
 
diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c
index 328fbf3c036..65d1b943b29 100644
--- a/src/plugins/select/bluegene/plugin/select_bluegene.c
+++ b/src/plugins/select/bluegene/plugin/select_bluegene.c
@@ -706,11 +706,13 @@ extern int select_p_update_block (update_part_msg_t *part_desc_ptr)
 	time_t now;
 	char reason[128], tmp[64], time_str[32];
 
+	slurm_mutex_lock(&block_state_mutex);
 	bg_record = find_bg_record_in_list(bg_lists->main, part_desc_ptr->name);
-	if(!bg_record)
+	if(!bg_record) {
+		slurm_mutex_unlock(&block_state_mutex);
 		return SLURM_ERROR;
+	}
 
-	slurm_mutex_lock(&block_state_mutex);
 	now = time(NULL);
 	slurm_make_time_str(&now, time_str, sizeof(time_str));
 	snprintf(tmp, sizeof(tmp), "[SLURM@%s]", time_str);
-- 
GitLab