From 24f9552c8f99348087e756e1240a043ee53d620e Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Tue, 9 May 2006 20:33:10 +0000
Subject: [PATCH] more memory leaks fixed along with a race condition fix for
 the bluegene plugin

---
 .../block_allocator/block_allocator.c         |   7 +-
 .../select/bluegene/plugin/bg_block_info.c    |  19 +--
 .../select/bluegene/plugin/bg_job_place.c     |  10 ++
 .../select/bluegene/plugin/bg_job_run.c       |   1 +
 .../select/bluegene/plugin/block_sys.c        |   2 +-
 src/plugins/select/bluegene/plugin/bluegene.c |  14 ++-
 .../select/bluegene/plugin/state_test.c       | 113 ++++++++++++++++++
 .../select/bluegene/plugin/state_test.h       |   6 +
 src/slurmd/slurmd/req.c                       |  11 +-
 src/slurmd/slurmd/slurmd.c                    |   1 +
 10 files changed, 162 insertions(+), 22 deletions(-)

diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c
index 3e2e2954891..931b1d7d6bc 100644
--- a/src/plugins/select/bluegene/block_allocator/block_allocator.c
+++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c
@@ -2359,13 +2359,14 @@ start_again:
 				return 1;
 			}
 			
+			remove_block(results, color_count);
+			list_destroy(results);
+			results = list_create(NULL);
 			if(ba_request->start_req) 
 				goto requested_end;
 			//exit(0);
 			debug2("trying something else");
-			remove_block(results, color_count);
-			list_destroy(results);
-			results = list_create(NULL);
+			
 		}
 		
 #ifdef HAVE_BG
diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c
index 55d8eb50f29..edc0c784743 100644
--- a/src/plugins/select/bluegene/plugin/bg_block_info.c
+++ b/src/plugins/select/bluegene/plugin/bg_block_info.c
@@ -63,18 +63,21 @@ static int  _block_is_deallocating(bg_record_t *bg_record);
 
 static int _block_is_deallocating(bg_record_t *bg_record)
 {
+	int jobid = bg_record->job_running;
+	char *user_name = NULL;
 	slurm_conf_lock();
+	user_name = xstrdup(slurmctld_conf.slurm_user_name);
 	if(remove_all_users(bg_record->bg_block_id, NULL) 
 	   == REMOVE_USER_ERR) {
 		error("Something happened removing "
 		      "users from partition %s", 
 		      bg_record->bg_block_id);
 	} 
+	slurm_conf_unlock();
 	
 	if(bg_record->target_name 
 	   && bg_record->user_name) {
-		if(!strcmp(bg_record->target_name, 
-			   slurmctld_conf.slurm_user_name)) {
+		if(!strcmp(bg_record->target_name, user_name)) {
 			if(strcmp(bg_record->target_name, 
 				  bg_record->user_name)) {
 				error("Block %s was in a ready state "
@@ -82,10 +85,10 @@ static int _block_is_deallocating(bg_record_t *bg_record)
 				      "Job %d was lost.",
 				      bg_record->bg_block_id,
 				      bg_record->user_name,
-				      bg_record->job_running);
-				if(bg_record->job_running > -1)
-					slurm_fail_job(bg_record->job_running);
+				      jobid);
 				slurm_mutex_unlock(&block_state_mutex);
+				if(jobid > -1)
+					slurm_fail_job(jobid);
 				if(remove_from_bg_list(bg_job_block_list, 
 						       bg_record) 
 				   == SLURM_SUCCESS) {
@@ -118,12 +121,12 @@ static int _block_is_deallocating(bg_record_t *bg_record)
 		error("Target Name and User Name are "
 		      "not set for partition %s.",
 		      bg_record->bg_block_id);
-		bg_record->user_name = 
-			xstrdup(slurmctld_conf.slurm_user_name);
+		bg_record->user_name = xstrdup(user_name);
 		bg_record->target_name = 
 			xstrdup(bg_record->user_name);
 	}
-	slurm_conf_unlock();
+
+	xfree(user_name);
 			
 	return SLURM_SUCCESS;
 }
diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c
index 47360123220..2986a2c068e 100644
--- a/src/plugins/select/bluegene/plugin/bg_job_place.c
+++ b/src/plugins/select/bluegene/plugin/bg_job_place.c
@@ -434,6 +434,16 @@ try_again:
 	
 	/* set the bitmap and do other allocation activities */
 	if (*found_bg_record) {
+		if(!test_only) {
+			if(check_block_bp_states(
+				   (*found_bg_record)->bg_block_id) 
+			   == SLURM_ERROR) {
+				(*found_bg_record)->job_running = -3;
+				(*found_bg_record)->state = RM_PARTITION_ERROR;
+				slurm_mutex_unlock(&block_state_mutex);
+				goto try_again;
+			}
+		}
 		format_node_name(*found_bg_record, tmp_char);
 	
 		debug("_find_best_block_match %s <%s>", 
diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c
index 6ec2d6f6a50..0624f837011 100644
--- a/src/plugins/select/bluegene/plugin/bg_job_run.c
+++ b/src/plugins/select/bluegene/plugin/bg_job_run.c
@@ -467,6 +467,7 @@ static void _term_agent(bg_update_t *bg_update_ptr)
 			bg_record->job_running = -1;
 		
 		/*remove user from list */
+		
 		slurm_conf_lock();
 		if(bg_record->target_name) {
 			if(strcmp(bg_record->target_name, 
diff --git a/src/plugins/select/bluegene/plugin/block_sys.c b/src/plugins/select/bluegene/plugin/block_sys.c
index 93479e6a47f..b69db173d5e 100755
--- a/src/plugins/select/bluegene/plugin/block_sys.c
+++ b/src/plugins/select/bluegene/plugin/block_sys.c
@@ -564,7 +564,7 @@ int read_bg_blocks()
 			}
 			if ((rc = rm_get_data(bp_ptr, RM_BPID, &bpid))
 			    != STATUS_OK) {
-				error("rm_get_data(RM_BPLoc): %s",
+				error("rm_get_data(RM_BPID): %s",
 				      bg_err_str(rc));
 				rc = SLURM_ERROR;
 				break;
diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c
index 36610b77be2..c25138eacfc 100644
--- a/src/plugins/select/bluegene/plugin/bluegene.c
+++ b/src/plugins/select/bluegene/plugin/bluegene.c
@@ -909,6 +909,7 @@ extern int create_defined_blocks(bg_layout_t overlapped)
 					if(!name) {			
 						debug("I was unable to make "
 						      "the requested block.");
+						list_iterator_destroy(itr);
 						slurm_mutex_unlock(
 							&block_state_mutex);
 						return SLURM_ERROR;
@@ -918,6 +919,7 @@ extern int create_defined_blocks(bg_layout_t overlapped)
 					debug("something happened in the "
 					      "load of %s", 
 					      bg_record->bg_block_id);
+					list_iterator_destroy(itr);
 					slurm_mutex_unlock(
 						&block_state_mutex);
 					return SLURM_ERROR;
@@ -1091,7 +1093,10 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list)
 			num_quarter=4;
 		}
 		if(_breakup_blocks(request, my_block_list, &block_inx) 
-		   == SLURM_SUCCESS)
+		   != SLURM_SUCCESS) {
+			debug2("small block not able to be placed");
+			//rc = SLURM_ERROR;
+		} else 
 			goto finished;
 	}
 	
@@ -1443,7 +1448,9 @@ extern void *mult_free_block(void *args)
 		list_destroy(bg_freeing_list);
 		bg_freeing_list = NULL;
 	}
-	slurm_mutex_unlock(&freed_cnt_mutex);	
+	slurm_mutex_unlock(&freed_cnt_mutex);
+	if(free_cnt == 0)
+		list_destroy(bg_free_block_list);
 	return NULL;
 }
 
@@ -1547,6 +1554,9 @@ extern void *mult_destroy_block(void *args)
 		bg_freeing_list = NULL;
 	}
 	slurm_mutex_unlock(&freed_cnt_mutex);	
+	if(destroy_cnt == 0)
+		list_destroy(bg_destroy_block_list);
+	
 	return NULL;
 }
 
diff --git a/src/plugins/select/bluegene/plugin/state_test.c b/src/plugins/select/bluegene/plugin/state_test.c
index 03833326416..c606eaa4f52 100644
--- a/src/plugins/select/bluegene/plugin/state_test.c
+++ b/src/plugins/select/bluegene/plugin/state_test.c
@@ -336,3 +336,116 @@ extern void test_mmcs_failures(void)
 #endif
 }
 
+extern int check_block_bp_states(char *bg_block_id)
+{
+	int rc = SLURM_SUCCESS;
+#ifdef HAVE_BG_FILES
+	rm_partition_t *block_ptr = NULL;
+	rm_BP_t *bp_ptr = NULL;
+	char *bpid = NULL;
+	int bp_cnt = 0;
+	int i = 0;
+	int *coord = NULL;
+	rm_BP_state_t bp_state;
+	char bg_down_node[128], reason[128];
+	char down_node_list[BUFSIZE];
+	time_t now = time(NULL);
+	struct tm * time_ptr = localtime(&now);
+	
+	down_node_list[0] = '\0';
+	slurm_mutex_lock(&api_file_mutex);
+	if ((rc = rm_get_partition(bg_block_id, &block_ptr)) != STATUS_OK) {
+		error("Partition %s doesn't exist.", bg_block_id);
+		rc = SLURM_ERROR;
+		slurm_mutex_unlock(&api_file_mutex);
+		goto done;
+	}
+	slurm_mutex_unlock(&api_file_mutex);
+	
+	if ((rc = rm_get_data(block_ptr, RM_PartitionBPNum, &bp_cnt)) 
+	    != STATUS_OK) {
+		error("rm_get_data(RM_BPNum): %s", bg_err_str(rc));
+		rc = SLURM_ERROR;
+		goto cleanup;
+	}
+	
+	for(i=0; i<bp_cnt; i++) {
+		if(i) {
+			if ((rc = rm_get_data(block_ptr, 
+					      RM_PartitionNextBP, 
+					      &bp_ptr))
+			    != STATUS_OK) {
+				error("rm_get_data(RM_NextBP): %s",
+				      bg_err_str(rc));
+				rc = SLURM_ERROR;
+				break;
+			}
+		} else {
+			if ((rc = rm_get_data(block_ptr, 
+					      RM_PartitionFirstBP, 
+					      &bp_ptr))
+			    != STATUS_OK) {
+				error("rm_get_data(RM_FirstBP): %s", 
+				      bg_err_str(rc));
+				rc = SLURM_ERROR;
+				break;
+			}	
+		}
+		if ((rc = rm_get_data(bp_ptr, RM_BPState, &bp_state))
+		    != STATUS_OK) {
+			error("rm_get_data(RM_BPLoc): %s",
+			      bg_err_str(rc));
+			rc = SLURM_ERROR;
+			break;
+		}
+		if(bp_state == RM_BP_UP)
+			continue;
+		rc = SLURM_ERROR;
+		if ((rc = rm_get_data(bp_ptr, RM_BPID, &bpid))
+		    != STATUS_OK) {
+			error("rm_get_data(RM_BPID): %s",
+			      bg_err_str(rc));
+			break;
+		}
+		coord = find_bp_loc(bpid);
+		
+		if(!coord) {
+			fatal("Could not find coordinates for "
+			      "BP ID %s", (char *) bpid);
+		}
+		free(bpid);
+		slurm_conf_lock();
+		snprintf(bg_down_node, sizeof(bg_down_node), "%s%d%d%d", 
+			 slurmctld_conf.node_prefix,
+			 coord[X], coord[Y], coord[Z]);
+		slurm_conf_unlock();
+	
+		if (node_already_down(bg_down_node))
+			continue;
+
+		debug("check_block_bp_states: %s in state %s", 
+		      bg_down_node, _convert_bp_state(bp_state));
+		if ((strlen(down_node_list) + strlen(bg_down_node) + 2) 
+		    < BUFSIZE) {
+			if (down_node_list[0] != '\0')
+				strcat(down_node_list,",");
+			strcat(down_node_list, bg_down_node);
+		} else
+			error("down_node_list overflow");
+	}
+	
+cleanup:
+	rm_free_partition(block_ptr);
+done:
+	if (down_node_list[0]) {
+		strftime(reason, sizeof(reason), 
+			 "select_bluegene: MMCS state not UP "
+			 "[SLURM@%b %d %H:%M]", 
+			 time_ptr);
+		slurm_drain_nodes(down_node_list, reason);
+	}
+#endif
+	return rc;
+
+}
+
diff --git a/src/plugins/select/bluegene/plugin/state_test.h b/src/plugins/select/bluegene/plugin/state_test.h
index 897ee3be32c..6c1673e68e9 100644
--- a/src/plugins/select/bluegene/plugin/state_test.h
+++ b/src/plugins/select/bluegene/plugin/state_test.h
@@ -37,4 +37,10 @@ extern bool node_already_down(char *node_name);
  */
 extern void test_mmcs_failures(void);
 
+/*
+ * Search MMCS for failed switches and nodes inside of block. 
+ * Failed resources are DRAINED in SLURM. This relies upon rm_get_partition(), 
+ */
+extern int check_block_bp_states(char *bg_block_id);
+
 #endif /* _STATE_TEST_H_ */
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index a912ac85e6d..5891b6940f5 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -1395,7 +1395,7 @@ static void
 _rpc_reattach_tasks(slurm_msg_t *msg, slurm_addr *cli)
 {
 	reattach_tasks_request_msg_t  *req = msg->data;
-	reattach_tasks_response_msg_t *resp;
+	reattach_tasks_response_msg_t *resp = NULL;
 	slurm_msg_t                    resp_msg;
 	int          rc   = SLURM_SUCCESS;
 	uint16_t     port = 0;
@@ -1405,7 +1405,7 @@ _rpc_reattach_tasks(slurm_msg_t *msg, slurm_addr *cli)
 	int          len;
 	int               fd;
 	uid_t             req_uid;
-	slurmstepd_info_t *step;
+	slurmstepd_info_t *step = NULL;
 
 	resp = xmalloc(sizeof(reattach_tasks_response_msg_t));
 	memset(&resp_msg, 0, sizeof(slurm_msg_t));
@@ -1479,12 +1479,7 @@ done:
 	resp->return_code     = rc;
 
 	slurm_send_only_node_msg(&resp_msg);
-
-	if (resp->gtids)
-		xfree(resp->gtids);
-	if (resp->local_pids)
-		xfree(resp->local_pids);
-	xfree(resp);
+	slurm_free_reattach_tasks_response_msg(resp);
 }
 
 static uid_t 
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index 3fac4e3d61c..d9b2d7c4859 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -324,6 +324,7 @@ _handle_connection(slurm_fd fd, slurm_addr *cli)
 		errno = rc;
 		xfree(arg);
 		error("Unable to set detachstate on attr: %m");
+		slurm_attr_destroy(&attr);
 		return;
 	}
 
-- 
GitLab