From 607adb30b97b4bcc64bcaca8a4e23aaf0f88b041 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Mon, 15 Jun 2009 22:16:40 +0000 Subject: [PATCH] better locking in bluegene --- .../select/bluegene/plugin/bg_block_info.c | 2 +- .../select/bluegene/plugin/bg_job_run.c | 53 ++++++++++--------- .../bluegene/plugin/bg_record_functions.c | 11 ++-- .../select/bluegene/plugin/block_sys.c | 2 + .../select/bluegene/plugin/select_bluegene.c | 6 ++- 5 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index cbbb6698fa5..ba43680507a 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -176,8 +176,8 @@ extern int block_ready(struct job_record *job_ptr) rc = select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_BLOCK_ID, &block_id); if (rc == SLURM_SUCCESS) { - bg_record = find_bg_record_in_list(bg_lists->main, block_id); slurm_mutex_lock(&block_state_mutex); + bg_record = find_bg_record_in_list(bg_lists->main, block_id); if(bg_record) { if(bg_record->job_running != job_ptr->job_id) { diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index ab69304e59f..29c4951a533 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -297,13 +297,14 @@ static void _sync_agent(bg_update_t *bg_update_ptr) { bg_record_t * bg_record = NULL; + slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, bg_update_ptr->bg_block_id); if(!bg_record) { + slurm_mutex_unlock(&block_state_mutex); error("No block %s", bg_update_ptr->bg_block_id); return; } - slurm_mutex_lock(&block_state_mutex); last_bg_update = time(NULL); bg_record->job_running = bg_update_ptr->job_ptr->job_id; @@ -315,13 +316,11 @@ static void _sync_agent(bg_update_t *bg_update_ptr) } if(!block_ptr_exist_in_list(bg_lists->booted, bg_record)) list_push(bg_lists->booted, bg_record); - slurm_mutex_unlock(&block_state_mutex); if(bg_record->state == RM_PARTITION_READY) { if(bg_record->user_uid != bg_update_ptr->job_ptr->user_id) { int set_user_rc = SLURM_SUCCESS; - slurm_mutex_lock(&block_state_mutex); debug("User isn't correct for job %d on %s, " "fixing...", bg_update_ptr->job_ptr->job_id, @@ -334,7 +333,9 @@ static void _sync_agent(bg_update_t *bg_update_ptr) if(set_user_rc == SLURM_ERROR) (void) slurm_fail_job(bg_record->job_running); - } + } else + slurm_mutex_unlock(&block_state_mutex); + } else { if(bg_record->state != RM_PARTITION_CONFIGURING) { error("Block %s isn't ready and isn't " @@ -344,6 +345,7 @@ static void _sync_agent(bg_update_t *bg_update_ptr) debug("Block %s is booting, job ok", bg_update_ptr->bg_block_id); } + slurm_mutex_unlock(&block_state_mutex); _start_agent(bg_update_ptr); } } @@ -362,10 +364,12 @@ static void _start_agent(bg_update_t *bg_update_ptr) slurm_mutex_lock(&job_start_mutex); + slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, bg_update_ptr->bg_block_id); if(!bg_record) { + slurm_mutex_unlock(&block_state_mutex); error("block %s not found in bg_lists->main", bg_update_ptr->bg_block_id); /* wait for the slurmd to begin @@ -385,7 +389,7 @@ static void _start_agent(bg_update_t *bg_update_ptr) slurm_mutex_unlock(&job_start_mutex); return; } - slurm_mutex_lock(&block_state_mutex); + if(bg_record->job_running <= NO_JOB_RUNNING) { // _reset_block(bg_record); should already happened slurm_mutex_unlock(&block_state_mutex); @@ -816,10 +820,10 @@ static void _term_agent(bg_update_t *bg_update_ptr) #endif /* remove the block's users */ + slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, bg_update_ptr->bg_block_id); if(bg_record) { - slurm_mutex_lock(&block_state_mutex); debug("got the record %s user is %s", bg_record->bg_block_id, bg_record->user_name); @@ -838,10 +842,7 @@ static void _term_agent(bg_update_t *bg_update_ptr) bg_update_ptr->bg_block_id); } - _reset_block(bg_record); - - slurm_mutex_unlock(&block_state_mutex); - + _reset_block(bg_record); } else if (bg_conf->layout_mode == LAYOUT_DYNAMIC) { debug2("Hopefully we are destroying this block %s " "since it isn't in the bg_lists->main", @@ -854,6 +855,7 @@ static void _term_agent(bg_update_t *bg_update_ptr) "happen outside of that.", bg_update_ptr->bg_block_id); } + slurm_mutex_unlock(&block_state_mutex); #ifdef HAVE_BG_FILES if ((rc = bridge_free_job_list(job_list)) != STATUS_OK) @@ -1121,29 +1123,32 @@ extern int start_job(struct job_record *job_ptr) SELECT_JOBDATA_RAMDISK_IMAGE, bg_update_ptr->ramdiskimage); } + + slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, bg_update_ptr->bg_block_id); - if (bg_record) { - slurm_mutex_lock(&block_state_mutex); - last_bg_update = time(NULL); - job_ptr->num_procs = bg_record->cpu_cnt; - job_ptr->total_procs = job_ptr->num_procs; - bg_record->job_running = bg_update_ptr->job_ptr->job_id; - bg_record->job_ptr = bg_update_ptr->job_ptr; - if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) { - list_push(bg_lists->job_running, bg_record); - num_unused_cpus -= bg_record->cpu_cnt; - } - if(!block_ptr_exist_in_list(bg_lists->booted, bg_record)) - list_push(bg_lists->booted, bg_record); + if (!bg_record) { slurm_mutex_unlock(&block_state_mutex); - } else { error("bg_record %s doesn't exist, requested for job (%d)", bg_update_ptr->bg_block_id, job_ptr->job_id); _bg_list_del(bg_update_ptr); return SLURM_ERROR; } + + last_bg_update = time(NULL); + job_ptr->num_procs = bg_record->cpu_cnt; + job_ptr->total_procs = job_ptr->num_procs; + bg_record->job_running = bg_update_ptr->job_ptr->job_id; + bg_record->job_ptr = bg_update_ptr->job_ptr; + if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) { + list_push(bg_lists->job_running, bg_record); + num_unused_cpus -= bg_record->cpu_cnt; + } + if(!block_ptr_exist_in_list(bg_lists->booted, bg_record)) + list_push(bg_lists->booted, bg_record); + slurm_mutex_unlock(&block_state_mutex); + info("Queue start of job %u in BG block %s", job_ptr->job_id, bg_update_ptr->bg_block_id); diff --git a/src/plugins/select/bluegene/plugin/bg_record_functions.c b/src/plugins/select/bluegene/plugin/bg_record_functions.c index 35be54c0fa4..cacb4dfd0f7 100644 --- a/src/plugins/select/bluegene/plugin/bg_record_functions.c +++ b/src/plugins/select/bluegene/plugin/bg_record_functions.c @@ -494,6 +494,9 @@ extern int bg_record_cmpf_inc(bg_record_t* rec_a, bg_record_t* rec_b) return 0; } +/* if looking at the main list this should have some nice + * block_state_mutex locks around it. + */ extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id) { ListIterator itr; @@ -504,16 +507,14 @@ extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id) if(!bg_block_id) return NULL; - slurm_mutex_lock(&block_state_mutex); itr = list_iterator_create(my_list); - while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { + while((bg_record = list_next(itr))) { if(bg_record->bg_block_id) - if (!strcmp(bg_record->bg_block_id, - bg_block_id)) + if(!strcmp(bg_record->bg_block_id, bg_block_id)) break; } list_iterator_destroy(itr); - slurm_mutex_unlock(&block_state_mutex); + if(bg_record) return bg_record; else diff --git a/src/plugins/select/bluegene/plugin/block_sys.c b/src/plugins/select/bluegene/plugin/block_sys.c index d4d3b8172ad..edbbd3a6a78 100755 --- a/src/plugins/select/bluegene/plugin/block_sys.c +++ b/src/plugins/select/bluegene/plugin/block_sys.c @@ -1065,6 +1065,7 @@ extern int load_state_file(List curr_block_list, char *dir_name) /* we only care about the states we need here * everthing else should have been set up already */ if(bg_info_record->state == RM_PARTITION_ERROR) { + slurm_mutex_lock(&block_state_mutex); if((bg_record = find_bg_record_in_list( curr_block_list, bg_info_record->bg_block_id))) @@ -1075,6 +1076,7 @@ extern int load_state_file(List curr_block_list, char *dir_name) around in bg_lists->main. */ bg_record->state = bg_info_record->state; + slurm_mutex_unlock(&block_state_mutex); } } diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c index 328fbf3c036..65d1b943b29 100644 --- a/src/plugins/select/bluegene/plugin/select_bluegene.c +++ b/src/plugins/select/bluegene/plugin/select_bluegene.c @@ -706,11 +706,13 @@ extern int select_p_update_block (update_part_msg_t *part_desc_ptr) time_t now; char reason[128], tmp[64], time_str[32]; + slurm_mutex_lock(&block_state_mutex); bg_record = find_bg_record_in_list(bg_lists->main, part_desc_ptr->name); - if(!bg_record) + if(!bg_record) { + slurm_mutex_unlock(&block_state_mutex); return SLURM_ERROR; + } - slurm_mutex_lock(&block_state_mutex); now = time(NULL); slurm_make_time_str(&now, time_str, sizeof(time_str)); snprintf(tmp, sizeof(tmp), "[SLURM@%s]", time_str); -- GitLab