From 24f9552c8f99348087e756e1240a043ee53d620e Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Tue, 9 May 2006 20:33:10 +0000 Subject: [PATCH] more memory leaks fixed along with a race condition fix for the bluegene plugin --- .../block_allocator/block_allocator.c | 7 +- .../select/bluegene/plugin/bg_block_info.c | 19 +-- .../select/bluegene/plugin/bg_job_place.c | 10 ++ .../select/bluegene/plugin/bg_job_run.c | 1 + .../select/bluegene/plugin/block_sys.c | 2 +- src/plugins/select/bluegene/plugin/bluegene.c | 14 ++- .../select/bluegene/plugin/state_test.c | 113 ++++++++++++++++++ .../select/bluegene/plugin/state_test.h | 6 + src/slurmd/slurmd/req.c | 11 +- src/slurmd/slurmd/slurmd.c | 1 + 10 files changed, 162 insertions(+), 22 deletions(-) diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index 3e2e2954891..931b1d7d6bc 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -2359,13 +2359,14 @@ start_again: return 1; } + remove_block(results, color_count); + list_destroy(results); + results = list_create(NULL); if(ba_request->start_req) goto requested_end; //exit(0); debug2("trying something else"); - remove_block(results, color_count); - list_destroy(results); - results = list_create(NULL); + } #ifdef HAVE_BG diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index 55d8eb50f29..edc0c784743 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -63,18 +63,21 @@ static int _block_is_deallocating(bg_record_t *bg_record); static int _block_is_deallocating(bg_record_t *bg_record) { + int jobid = bg_record->job_running; + char *user_name = NULL; slurm_conf_lock(); + user_name = xstrdup(slurmctld_conf.slurm_user_name); if(remove_all_users(bg_record->bg_block_id, NULL) == REMOVE_USER_ERR) { error("Something happened removing " "users from partition %s", bg_record->bg_block_id); } + slurm_conf_unlock(); if(bg_record->target_name && bg_record->user_name) { - if(!strcmp(bg_record->target_name, - slurmctld_conf.slurm_user_name)) { + if(!strcmp(bg_record->target_name, user_name)) { if(strcmp(bg_record->target_name, bg_record->user_name)) { error("Block %s was in a ready state " @@ -82,10 +85,10 @@ static int _block_is_deallocating(bg_record_t *bg_record) "Job %d was lost.", bg_record->bg_block_id, bg_record->user_name, - bg_record->job_running); - if(bg_record->job_running > -1) - slurm_fail_job(bg_record->job_running); + jobid); slurm_mutex_unlock(&block_state_mutex); + if(jobid > -1) + slurm_fail_job(jobid); if(remove_from_bg_list(bg_job_block_list, bg_record) == SLURM_SUCCESS) { @@ -118,12 +121,12 @@ static int _block_is_deallocating(bg_record_t *bg_record) error("Target Name and User Name are " "not set for partition %s.", bg_record->bg_block_id); - bg_record->user_name = - xstrdup(slurmctld_conf.slurm_user_name); + bg_record->user_name = xstrdup(user_name); bg_record->target_name = xstrdup(bg_record->user_name); } - slurm_conf_unlock(); + + xfree(user_name); return SLURM_SUCCESS; } diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 47360123220..2986a2c068e 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -434,6 +434,16 @@ try_again: /* set the bitmap and do other allocation activities */ if (*found_bg_record) { + if(!test_only) { + if(check_block_bp_states( + (*found_bg_record)->bg_block_id) + == SLURM_ERROR) { + (*found_bg_record)->job_running = -3; + (*found_bg_record)->state = RM_PARTITION_ERROR; + slurm_mutex_unlock(&block_state_mutex); + goto try_again; + } + } format_node_name(*found_bg_record, tmp_char); debug("_find_best_block_match %s <%s>", diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index 6ec2d6f6a50..0624f837011 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -467,6 +467,7 @@ static void _term_agent(bg_update_t *bg_update_ptr) bg_record->job_running = -1; /*remove user from list */ + slurm_conf_lock(); if(bg_record->target_name) { if(strcmp(bg_record->target_name, diff --git a/src/plugins/select/bluegene/plugin/block_sys.c b/src/plugins/select/bluegene/plugin/block_sys.c index 93479e6a47f..b69db173d5e 100755 --- a/src/plugins/select/bluegene/plugin/block_sys.c +++ b/src/plugins/select/bluegene/plugin/block_sys.c @@ -564,7 +564,7 @@ int read_bg_blocks() } if ((rc = rm_get_data(bp_ptr, RM_BPID, &bpid)) != STATUS_OK) { - error("rm_get_data(RM_BPLoc): %s", + error("rm_get_data(RM_BPID): %s", bg_err_str(rc)); rc = SLURM_ERROR; break; diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 36610b77be2..c25138eacfc 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -909,6 +909,7 @@ extern int create_defined_blocks(bg_layout_t overlapped) if(!name) { debug("I was unable to make " "the requested block."); + list_iterator_destroy(itr); slurm_mutex_unlock( &block_state_mutex); return SLURM_ERROR; @@ -918,6 +919,7 @@ extern int create_defined_blocks(bg_layout_t overlapped) debug("something happened in the " "load of %s", bg_record->bg_block_id); + list_iterator_destroy(itr); slurm_mutex_unlock( &block_state_mutex); return SLURM_ERROR; @@ -1091,7 +1093,10 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) num_quarter=4; } if(_breakup_blocks(request, my_block_list, &block_inx) - == SLURM_SUCCESS) + != SLURM_SUCCESS) { + debug2("small block not able to be placed"); + //rc = SLURM_ERROR; + } else goto finished; } @@ -1443,7 +1448,9 @@ extern void *mult_free_block(void *args) list_destroy(bg_freeing_list); bg_freeing_list = NULL; } - slurm_mutex_unlock(&freed_cnt_mutex); + slurm_mutex_unlock(&freed_cnt_mutex); + if(free_cnt == 0) + list_destroy(bg_free_block_list); return NULL; } @@ -1547,6 +1554,9 @@ extern void *mult_destroy_block(void *args) bg_freeing_list = NULL; } slurm_mutex_unlock(&freed_cnt_mutex); + if(destroy_cnt == 0) + list_destroy(bg_destroy_block_list); + return NULL; } diff --git a/src/plugins/select/bluegene/plugin/state_test.c b/src/plugins/select/bluegene/plugin/state_test.c index 03833326416..c606eaa4f52 100644 --- a/src/plugins/select/bluegene/plugin/state_test.c +++ b/src/plugins/select/bluegene/plugin/state_test.c @@ -336,3 +336,116 @@ extern void test_mmcs_failures(void) #endif } +extern int check_block_bp_states(char *bg_block_id) +{ + int rc = SLURM_SUCCESS; +#ifdef HAVE_BG_FILES + rm_partition_t *block_ptr = NULL; + rm_BP_t *bp_ptr = NULL; + char *bpid = NULL; + int bp_cnt = 0; + int i = 0; + int *coord = NULL; + rm_BP_state_t bp_state; + char bg_down_node[128], reason[128]; + char down_node_list[BUFSIZE]; + time_t now = time(NULL); + struct tm * time_ptr = localtime(&now); + + down_node_list[0] = '\0'; + slurm_mutex_lock(&api_file_mutex); + if ((rc = rm_get_partition(bg_block_id, &block_ptr)) != STATUS_OK) { + error("Partition %s doesn't exist.", bg_block_id); + rc = SLURM_ERROR; + slurm_mutex_unlock(&api_file_mutex); + goto done; + } + slurm_mutex_unlock(&api_file_mutex); + + if ((rc = rm_get_data(block_ptr, RM_PartitionBPNum, &bp_cnt)) + != STATUS_OK) { + error("rm_get_data(RM_BPNum): %s", bg_err_str(rc)); + rc = SLURM_ERROR; + goto cleanup; + } + + for(i=0; i<bp_cnt; i++) { + if(i) { + if ((rc = rm_get_data(block_ptr, + RM_PartitionNextBP, + &bp_ptr)) + != STATUS_OK) { + error("rm_get_data(RM_NextBP): %s", + bg_err_str(rc)); + rc = SLURM_ERROR; + break; + } + } else { + if ((rc = rm_get_data(block_ptr, + RM_PartitionFirstBP, + &bp_ptr)) + != STATUS_OK) { + error("rm_get_data(RM_FirstBP): %s", + bg_err_str(rc)); + rc = SLURM_ERROR; + break; + } + } + if ((rc = rm_get_data(bp_ptr, RM_BPState, &bp_state)) + != STATUS_OK) { + error("rm_get_data(RM_BPLoc): %s", + bg_err_str(rc)); + rc = SLURM_ERROR; + break; + } + if(bp_state == RM_BP_UP) + continue; + rc = SLURM_ERROR; + if ((rc = rm_get_data(bp_ptr, RM_BPID, &bpid)) + != STATUS_OK) { + error("rm_get_data(RM_BPID): %s", + bg_err_str(rc)); + break; + } + coord = find_bp_loc(bpid); + + if(!coord) { + fatal("Could not find coordinates for " + "BP ID %s", (char *) bpid); + } + free(bpid); + slurm_conf_lock(); + snprintf(bg_down_node, sizeof(bg_down_node), "%s%d%d%d", + slurmctld_conf.node_prefix, + coord[X], coord[Y], coord[Z]); + slurm_conf_unlock(); + + if (node_already_down(bg_down_node)) + continue; + + debug("check_block_bp_states: %s in state %s", + bg_down_node, _convert_bp_state(bp_state)); + if ((strlen(down_node_list) + strlen(bg_down_node) + 2) + < BUFSIZE) { + if (down_node_list[0] != '\0') + strcat(down_node_list,","); + strcat(down_node_list, bg_down_node); + } else + error("down_node_list overflow"); + } + +cleanup: + rm_free_partition(block_ptr); +done: + if (down_node_list[0]) { + strftime(reason, sizeof(reason), + "select_bluegene: MMCS state not UP " + "[SLURM@%b %d %H:%M]", + time_ptr); + slurm_drain_nodes(down_node_list, reason); + } +#endif + return rc; + +} + diff --git a/src/plugins/select/bluegene/plugin/state_test.h b/src/plugins/select/bluegene/plugin/state_test.h index 897ee3be32c..6c1673e68e9 100644 --- a/src/plugins/select/bluegene/plugin/state_test.h +++ b/src/plugins/select/bluegene/plugin/state_test.h @@ -37,4 +37,10 @@ extern bool node_already_down(char *node_name); */ extern void test_mmcs_failures(void); +/* + * Search MMCS for failed switches and nodes inside of block. + * Failed resources are DRAINED in SLURM. This relies upon rm_get_partition(), + */ +extern int check_block_bp_states(char *bg_block_id); + #endif /* _STATE_TEST_H_ */ diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index a912ac85e6d..5891b6940f5 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -1395,7 +1395,7 @@ static void _rpc_reattach_tasks(slurm_msg_t *msg, slurm_addr *cli) { reattach_tasks_request_msg_t *req = msg->data; - reattach_tasks_response_msg_t *resp; + reattach_tasks_response_msg_t *resp = NULL; slurm_msg_t resp_msg; int rc = SLURM_SUCCESS; uint16_t port = 0; @@ -1405,7 +1405,7 @@ _rpc_reattach_tasks(slurm_msg_t *msg, slurm_addr *cli) int len; int fd; uid_t req_uid; - slurmstepd_info_t *step; + slurmstepd_info_t *step = NULL; resp = xmalloc(sizeof(reattach_tasks_response_msg_t)); memset(&resp_msg, 0, sizeof(slurm_msg_t)); @@ -1479,12 +1479,7 @@ done: resp->return_code = rc; slurm_send_only_node_msg(&resp_msg); - - if (resp->gtids) - xfree(resp->gtids); - if (resp->local_pids) - xfree(resp->local_pids); - xfree(resp); + slurm_free_reattach_tasks_response_msg(resp); } static uid_t diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index 3fac4e3d61c..d9b2d7c4859 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -324,6 +324,7 @@ _handle_connection(slurm_fd fd, slurm_addr *cli) errno = rc; xfree(arg); error("Unable to set detachstate on attr: %m"); + slurm_attr_destroy(&attr); return; } -- GitLab