From 91f3a7498a391395e55a69e3c95f97591bf16590 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Tue, 7 Mar 2006 01:22:36 +0000 Subject: [PATCH] appears to work pretty well on bgldev in dynamic mode. more testing is definatly needed though. --- src/api/partition_info.c | 3 +- src/common/slurm_protocol_api.c | 9 +- .../block_allocator/block_allocator.c | 4 +- .../select/bluegene/plugin/bg_block_info.c | 20 ++- .../select/bluegene/plugin/bg_job_place.c | 33 ++-- .../select/bluegene/plugin/bg_job_run.c | 4 +- .../bluegene/plugin/bg_switch_connections.c | 2 +- .../select/bluegene/plugin/block_sys.c | 30 ++-- src/plugins/select/bluegene/plugin/bluegene.c | 148 +++++++++++------- src/plugins/select/bluegene/plugin/bluegene.h | 6 +- src/sinfo/sinfo.c | 2 +- src/smap/smap.c | 3 +- 12 files changed, 152 insertions(+), 112 deletions(-) diff --git a/src/api/partition_info.c b/src/api/partition_info.c index 296b6f23184..65914678fec 100644 --- a/src/api/partition_info.c +++ b/src/api/partition_info.c @@ -181,10 +181,9 @@ extern int slurm_load_partitions (time_t update_time, req_msg.msg_type = REQUEST_PARTITION_INFO; req_msg.data = &req; - if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; - + switch (resp_msg.msg_type) { case RESPONSE_PARTITION_INFO: *resp = (partition_info_msg_t *) resp_msg.data; diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 1c19f5afeaa..d80393b11aa 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1391,10 +1391,9 @@ int slurm_send_recv_controller_msg(slurm_msg_t *req, slurm_msg_t *resp) * control, we sleep and retry later */ ret_list = _send_and_recv_msg(fd, req, resp, 0); - if(errno == SLURM_SUCCESS) slurm_free_cred(resp->cred); - + rc = errno; if(ret_list) { @@ -1410,8 +1409,7 @@ int slurm_send_recv_controller_msg(slurm_msg_t *req, slurm_msg_t *resp) (resp->msg_type == RESPONSE_SLURM_RC) && ((((return_code_msg_t *) resp->data)->return_code) == ESLURM_IN_STANDBY_MODE) && - (req->msg_type - != MESSAGE_NODE_REGISTRATION_STATUS) && + (req->msg_type != MESSAGE_NODE_REGISTRATION_STATUS) && (slurmctld_conf.backup_controller) && (difftime(time(NULL), start_time) < (slurmctld_conf.slurmctld_timeout + @@ -1432,6 +1430,7 @@ int slurm_send_recv_controller_msg(slurm_msg_t *req, slurm_msg_t *resp) cleanup: if (rc != SLURM_SUCCESS) _remap_slurmctld_errno(); + return rc; } @@ -1803,7 +1802,7 @@ void slurm_free_msg(slurm_msg_t * msg) * Free just the credential of a message */ void slurm_free_cred(void *cred) -{ +{ (void) g_slurm_auth_destroy(cred); } diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index f1ae861be88..66316ed2ca7 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -597,7 +597,7 @@ extern void ba_init(node_info_msg_t *node_info_ptr) if(ba_system_ptr) _delete_ba_system(); - + ba_system_ptr = (ba_system_t *) xmalloc(sizeof(ba_system_t)); ba_system_ptr->xcord = 1; @@ -646,7 +646,7 @@ extern void ba_init(node_info_msg_t *node_info_ptr) DIM_SIZE[X] = node_info_ptr->record_count; #endif ba_system_ptr->num_of_proc = node_info_ptr->record_count; - } + } node_info_error: #ifdef HAVE_BG_FILES if (have_db2 diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index 367148f4cb1..f6790f43f80 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -212,23 +212,19 @@ extern int update_block_list() char *name = NULL; rm_partition_list_t *block_list = NULL; bg_record_t *bg_record = NULL; - //struct passwd *pw_ent = NULL; time_t now; struct tm *time_ptr; char reason[128]; int skipped_dealloc = 0; - if(!blocks_are_created) - return 0; - slurm_mutex_lock(&api_file_mutex); if ((rc = rm_get_partitions_info(block_state, &block_list)) != STATUS_OK) { slurm_mutex_unlock(&api_file_mutex); - error("rm_get_partitions_info(): %s", bg_err_str(rc)); + if(rc != PARTITION_NOT_FOUND) + error("rm_get_partitions_info(): %s", bg_err_str(rc)); return -1; } - slurm_mutex_unlock(&api_file_mutex); if ((rc = rm_get_data(block_list, RM_PartListSize, &num_blocks)) != STATUS_OK) { @@ -236,6 +232,7 @@ extern int update_block_list() updated = -1; num_blocks = 0; } + slurm_mutex_unlock(&api_file_mutex); for (j=0; j<num_blocks; j++) { if (j) { @@ -359,11 +356,11 @@ extern int update_block_list() if(bg_record->boot_state == 1) { switch(bg_record->state) { case RM_PARTITION_CONFIGURING: - debug("checking to make sure user %s " - "is the user.", - bg_record->target_name); - if(update_block_user(bg_record) == 1) - last_bg_update = time(NULL); + /* debug("checking to make sure user %s " */ +/* "is the user.", */ +/* bg_record->target_name); */ +/* if(update_block_user(bg_record) == 1) */ +/* last_bg_update = time(NULL); */ break; case RM_PARTITION_ERROR: error("partition in an error state"); @@ -415,6 +412,7 @@ extern int update_block_list() if ((rc = rm_free_partition_list(block_list)) != STATUS_OK) { error("rm_free_partition_list(): %s", bg_err_str(rc)); } + #endif return updated; } diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 887f271a6d4..42ee88a8e8a 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -76,8 +76,10 @@ static void _rotate_geo(uint16_t *req_geometry, int rot_cnt) * */ static int _find_best_block_match(struct job_record* job_ptr, - bitstr_t* slurm_block_bitmap, int min_nodes, int max_nodes, - int spec, bg_record_t** found_bg_record, bool test_only) + bitstr_t* slurm_block_bitmap, + int min_nodes, int max_nodes, + int spec, bg_record_t** found_bg_record, + bool test_only) { ListIterator itr; ListIterator itr2; @@ -88,11 +90,11 @@ static int _find_best_block_match(struct job_record* job_ptr, uint32_t req_procs = job_ptr->num_procs; uint32_t proc_cnt; ba_request_t request; - int i, job_running = 0; + int i; int rot_cnt = 0; int created = 0; int found = 0; - int max_procs = NO_VAL; + int max_procs = (uint16_t) NO_VAL; List lists_of_lists = NULL; List temp_list = NULL; char tmp_char[256]; @@ -109,8 +111,6 @@ static int _find_best_block_match(struct job_record* job_ptr, SELECT_DATA_GEOMETRY, &req_geometry); select_g_get_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_ROTATE, &rotate); - select_g_get_jobinfo(job_ptr->select_jobinfo, - SELECT_DATA_ROTATE, &rotate); select_g_get_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_MAX_PROCS, &max_procs); @@ -137,7 +137,8 @@ try_again: debug3("asking for %d-%d looking at %d", req_procs, max_procs, proc_cnt); if ((proc_cnt < req_procs) - || (max_procs != NO_VAL && proc_cnt > max_procs)) { + || (max_procs != (uint16_t) NO_VAL + && proc_cnt > max_procs)) { /* We use the proccessor count per partition here mostly to see if we can run on a smaller partition. */ @@ -190,11 +191,12 @@ try_again: scheduler that it is runnable just not right now. */ debug3("job_running = %d", record->job_running); - if((record->job_running != NO_VAL) + if((record->job_running != -1) && !test_only) { - debug("block %s in use by %s", + debug("block %s in use by %s job %d", record->bg_block_id, - record->user_name); + record->user_name, + record->job_running); found = 1; continue; } @@ -210,7 +212,8 @@ try_again: found_record->bg_block_id))) continue; if(blocks_overlap(record, found_record)) { - if((found_record->job_running != NO_VAL) + if((found_record->job_running + != -1) && !test_only) { debug("can't use %s, there is a job " "(%d) running on an overlapping " @@ -419,7 +422,7 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, select_g_sprint_jobinfo(job_ptr->select_jobinfo, buf, sizeof(buf), - SELECT_PRINT_MIXED); + SELECT_PRINT_MIXED); debug("bluegene:submit_job: %s nodes=%d-%d", buf, min_nodes, @@ -463,17 +466,17 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, select_g_set_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_GEOMETRY, &record->geo); + tmp16 = record->conn_type; select_g_set_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_CONN_TYPE, - &record->conn_type); + &tmp16); } if(test_only) { select_g_set_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_BLOCK_ID, "unassigned"); - } } - + return rc; } diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index 9e8ccc525eb..4d88c533e7e 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -284,7 +284,7 @@ static void _start_agent(bg_update_t *bg_update_ptr) sleep(1); } - if(bg_record->job_running == NO_VAL) { + if(bg_record->job_running == -1) { slurm_mutex_unlock(&job_start_mutex); return; } @@ -441,7 +441,7 @@ static void _term_agent(bg_update_t *bg_update_ptr) } slurm_mutex_lock(&block_state_mutex); - bg_record->job_running = NO_VAL; + bg_record->job_running = -1; /*remove user from list */ if(bg_record->target_name) { diff --git a/src/plugins/select/bluegene/plugin/bg_switch_connections.c b/src/plugins/select/bluegene/plugin/bg_switch_connections.c index 5e1762333f9..0f56a774d59 100644 --- a/src/plugins/select/bluegene/plugin/bg_switch_connections.c +++ b/src/plugins/select/bluegene/plugin/bg_switch_connections.c @@ -437,7 +437,7 @@ extern int configure_small_block(bg_record_t *bg_record) } if(bg_record->quarter != quarter) continue; - if(bg_record->segment != NO_VAL) { + if(bg_record->segment != (uint16_t) NO_VAL) { if(bg_record->segment != (i%4)) continue; } diff --git a/src/plugins/select/bluegene/plugin/block_sys.c b/src/plugins/select/bluegene/plugin/block_sys.c index 37f21beacf0..bf6b08a3426 100755 --- a/src/plugins/select/bluegene/plugin/block_sys.c +++ b/src/plugins/select/bluegene/plugin/block_sys.c @@ -147,6 +147,7 @@ static int _post_allocate(bg_record_t *bg_record) /* Add partition record to the DB */ debug("adding partition\n"); + slurm_mutex_lock(&api_file_mutex); for(i=0;i<MAX_ADD_RETRY; i++) { if ((rc = rm_add_partition(bg_record->bg_block)) != STATUS_OK) { @@ -159,11 +160,14 @@ static int _post_allocate(bg_record_t *bg_record) sleep(3); } if(rc == SLURM_ERROR) { + info("going to free it"); if ((rc = rm_free_partition(bg_record->bg_block)) != STATUS_OK) error("rm_free_partition(): %s", bg_err_str(rc)); fatal("couldn't add last partition."); } + slurm_mutex_unlock(&api_file_mutex); + debug("done adding\n"); /* Get back the new partition id */ @@ -193,8 +197,6 @@ static int _post_allocate(bg_record_t *bg_record) } else { bg_record->user_uid = pw_ent->pw_uid; } - last_bg_update = time(NULL); - } /* We are done with the partition */ if ((rc = rm_free_partition(bg_record->bg_block)) != STATUS_OK) @@ -205,6 +207,7 @@ static int _post_allocate(bg_record_t *bg_record) static int _post_bg_init_read(void *object, void *arg) { bg_record_t *bg_record = (bg_record_t *) object; + bg_record_t *tmp_record = NULL; int i = 1024; bg_record->nodes = xmalloc(i); while (hostlist_ranged_string(bg_record->hostlist, i, @@ -219,6 +222,11 @@ static int _post_bg_init_read(void *object, void *arg) fatal("Unable to convert nodes %s to bitmap", bg_record->nodes); } + if(bluegene_layout_mode == LAYOUT_DYNAMIC) { + tmp_record = xmalloc(sizeof(bg_record_t)); + copy_bg_record(bg_record, tmp_record); + list_push(bg_list, tmp_record); + } //print_bg_record(bg_record); return SLURM_SUCCESS; @@ -229,7 +237,7 @@ static int _find_32node_segment(bg_record_t *bg_record, char *my_card_name = NULL; char *card_name = NULL; rm_bp_id_t bp_id = NULL; - int segment = NO_VAL; + int segment = (uint16_t)NO_VAL; int card_count = 0; int num = 0; int i=0; @@ -357,6 +365,7 @@ int read_bg_blocks() rm_partition_list_t *block_list = NULL; rm_partition_state_flag_t state = PARTITION_ALL_FLAG; rm_nodecard_t *ncard = NULL; + rm_quarter_t quarter; bool small = false; slurm_mutex_lock(&api_file_mutex); @@ -370,18 +379,18 @@ int read_bg_blocks() slurm_mutex_lock(&api_file_mutex); if ((rc = rm_get_partitions_info(state, &block_list)) != STATUS_OK) { - error("rm_get_partitions_info(): %s", bg_err_str(rc)); + error("2 rm_get_partitions_info(): %s", bg_err_str(rc)); slurm_mutex_unlock(&api_file_mutex); return SLURM_ERROR; } - slurm_mutex_unlock(&api_file_mutex); if ((rc = rm_get_data(block_list, RM_PartListSize, &block_count)) != STATUS_OK) { error("rm_get_data(RM_PartListSize): %s", bg_err_str(rc)); block_count = 0; } + slurm_mutex_unlock(&api_file_mutex); for(block_number=0; block_number<block_count; block_number++) { @@ -434,15 +443,15 @@ int read_bg_blocks() bg_record = xmalloc(sizeof(bg_record_t)); list_push(bg_curr_block_list, bg_record); - + bg_record->bg_block_id = xstrdup(block_name); free(block_name); bg_record->state = NO_VAL; - bg_record->quarter = NO_VAL; - bg_record->segment = NO_VAL; - bg_record->job_running = NO_VAL; + bg_record->quarter = (uint16_t) NO_VAL; + bg_record->segment = (uint16_t) NO_VAL; + bg_record->job_running = -1; if ((rc = rm_get_data(block_ptr, RM_PartitionBPNum, @@ -472,10 +481,11 @@ int read_bg_blocks() } if ((rc = rm_get_data(ncard, RM_NodeCardQuarter, - &bg_record->quarter)) != STATUS_OK) { + &quarter)) != STATUS_OK) { error("rm_get_data(CardQuarter): %d",rc); bp_cnt = 0; } + bg_record->quarter = quarter; if((rc = rm_get_data(block_ptr, RM_PartitionNodeCardNum, &i)) diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 0814b7db0a1..3330a913516 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -91,7 +91,7 @@ static char *_get_bg_conf(void); static void _strip_13_10(char *line); static int _split_block(bg_record_t *bg_record, int procs, int *block_inx); static bg_record_t *_create_small_record(bg_record_t *bg_record, - int quarter, int segment); + uint16_t quarter, uint16_t segment); static int _add_bg_record(List records, char *nodes, rm_connection_type_t conn_type, int num_segment, int num_quarter); @@ -243,6 +243,8 @@ extern void destroy_bg_record(void *object) extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record) { + int i; + xfree(sec_record->bg_block_id); sec_record->bg_block_id = xstrdup(fir_record->bg_block_id); xfree(sec_record->nodes); @@ -260,10 +262,18 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record) sec_record->switch_count = fir_record->switch_count; sec_record->boot_state = fir_record->boot_state; sec_record->boot_count = fir_record->boot_count; + + for(i=0;i<BA_SYSTEM_DIMENSIONS;i++) { + sec_record->geo[i] = fir_record->geo[i]; + sec_record->start[i] = fir_record->start[i]; + } + if(sec_record->bitmap) bit_free(sec_record->bitmap); - if((sec_record->bitmap = bit_copy(fir_record->bitmap)) == NULL) { - error("Unable to copy bitmap for", fir_record->nodes); + if(fir_record->bitmap + && (sec_record->bitmap = bit_copy(fir_record->bitmap)) == NULL) { + error("Unable to copy bitmap for %s", fir_record->nodes); + sec_record->bitmap = NULL; } sec_record->job_running = fir_record->job_running; sec_record->cpus_per_bp = fir_record->cpus_per_bp; @@ -360,8 +370,8 @@ extern int update_block_user(bg_record_t *bg_record) extern int format_node_name(bg_record_t *bg_record, char tmp_char[]) { - if(bg_record->quarter != NO_VAL) { - if(bg_record->segment != NO_VAL) { + if(bg_record->quarter != (uint16_t)NO_VAL) { + if(bg_record->segment != (uint16_t)NO_VAL) { sprintf(tmp_char,"%s.%d.%d\0", bg_record->nodes, bg_record->quarter, @@ -389,13 +399,13 @@ extern bool blocks_overlap(bg_record_t *rec_a, bg_record_t *rec_b) } bit_free(my_bitmap); - if(rec_a->quarter != NO_VAL) { - if(rec_b->quarter == NO_VAL) + if(rec_a->quarter != (uint16_t) NO_VAL) { + if(rec_b->quarter == (uint16_t) NO_VAL) return true; else if(rec_a->quarter != rec_b->quarter) return false; - if(rec_a->segment != NO_VAL) { - if(rec_b->segment == NO_VAL) + if(rec_a->segment != (uint16_t) NO_VAL) { + if(rec_b->segment == (uint16_t) NO_VAL) return true; else if(rec_a->segment != rec_b->segment) @@ -583,13 +593,14 @@ extern void *bluegene_agent(void *args) if (difftime(now, last_bg_test) >= BG_POLL_TIME) { if (agent_fini) /* don't bother */ return NULL; /* quit now */ - if(last_bg_update) { + if(blocks_are_created) { last_bg_test = now; - if((rc = update_block_list()) == 1) + if((rc = update_block_list()) == 1) { + slurm_mutex_lock(&block_state_mutex); last_bg_update = now; - else if(rc == -1) - error("Error " - "with update_block_list"); + slurm_mutex_unlock(&block_state_mutex); + } else if(rc == -1) + error("Error with update_block_list"); } } @@ -751,7 +762,6 @@ extern int create_defined_blocks(bg_layout_t overlapped) return SLURM_ERROR; } #endif - last_bg_update = time(NULL); slurm_mutex_unlock(&block_state_mutex); create_full_system_block(); @@ -881,7 +891,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) itr = list_iterator_create(bg_list); while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { - if(bg_record->job_running != NO_VAL) + if(bg_record->job_running != -1) continue; if(bg_record->state != RM_PARTITION_FREE) continue; @@ -907,7 +917,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) itr = list_iterator_create(bg_list); while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { - if(bg_record->job_running != NO_VAL) + if(bg_record->job_running != -1) continue; proc_cnt = bg_record->bp_count * bg_record->cpus_per_bp; @@ -932,9 +942,15 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) &block_inx) == SLURM_SUCCESS) { list_remove(itr); + request->save_name = + xmalloc(sizeof(char) * 4); + sprintf(request->save_name, + "%d%d%d\0", + bg_record->start[X], + bg_record->start[Y], + bg_record->start[Z]); destroy_bg_record(bg_record); } - list_iterator_destroy(itr); rc = SLURM_SUCCESS; goto finished; @@ -966,7 +982,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) request->rotate_count = 0; request->elongate_count = 1; - if((bg_record->job_running == NO_VAL) + if((bg_record->job_running == -1) && (bg_record->cpus_per_bp == procs_per_node || (bg_record->quarter == 0 && bg_record->segment < 1))) { @@ -1032,7 +1048,6 @@ finished: if(results) list_destroy(results); - last_bg_update = time(NULL); slurm_mutex_unlock(&block_state_mutex); sort_bg_record_inc_size(bg_list); @@ -1178,12 +1193,14 @@ extern int bg_free_block(bg_record_t *bg_record) && bg_record->state != RM_PARTITION_DEALLOCATING) { #ifdef HAVE_BG_FILES debug("pm_destroy %s",bg_record->bg_block_id); - if ((rc = pm_destroy_partition( - bg_record->bg_block_id)) - != STATUS_OK) { + + slurm_mutex_lock(&api_file_mutex); + rc = pm_destroy_partition(bg_record->bg_block_id); + if (rc != STATUS_OK) { if(rc == PARTITION_NOT_FOUND) { debug("block %s is not found", bg_record->bg_block_id); + slurm_mutex_unlock(&api_file_mutex); break; } error("pm_destroy_partition(%s): %s " @@ -1191,6 +1208,7 @@ extern int bg_free_block(bg_record_t *bg_record) bg_record->bg_block_id, bg_err_str(rc), bg_record->state); } + slurm_mutex_unlock(&api_file_mutex); #else slurm_mutex_lock(&block_state_mutex); bg_record->state = RM_PARTITION_FREE; @@ -1268,18 +1286,26 @@ extern void *mult_destroy_block(void *args) bg_free_block(bg_record); #ifdef HAVE_BG_FILES - rc = rm_remove_partition( - bg_record->bg_block_id); + debug("removing from database %s", + (char *)bg_record->bg_block_id); + + slurm_mutex_lock(&api_file_mutex); + rc = rm_remove_partition(bg_record->bg_block_id); if (rc != STATUS_OK) { error("rm_remove_partition(%s): %s", bg_record->bg_block_id, bg_err_str(rc)); } else debug("done\n"); + slurm_mutex_unlock(&api_file_mutex); + #endif slurm_mutex_lock(&freed_cnt_mutex); num_block_freed++; - destroy_bg_record(bg_record); + slurm_mutex_lock(&block_state_mutex); + if(blocks_are_created) + destroy_bg_record(bg_record); + slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&freed_cnt_mutex); } slurm_mutex_lock(&freed_cnt_mutex); @@ -1479,9 +1505,6 @@ extern int read_bg_conf(void) /* looking for blocks only I created */ if(bluegene_layout_mode == LAYOUT_DYNAMIC) { init_wires(); - slurm_mutex_lock(&block_state_mutex); - last_bg_update = time(NULL); - slurm_mutex_unlock(&block_state_mutex); info("No blocks created until jobs are submitted"); } else { if (create_defined_blocks(bluegene_layout_mode) @@ -1494,8 +1517,17 @@ extern int read_bg_conf(void) return SLURM_ERROR; } } - debug("Blocks have finished being created."); + + slurm_mutex_lock(&block_state_mutex); + list_destroy(bg_curr_block_list); + bg_curr_block_list = NULL; + list_destroy(bg_found_block_list); + bg_curr_block_list = NULL; + last_bg_update = time(NULL); blocks_are_created = 1; + slurm_mutex_unlock(&block_state_mutex); + sort_bg_record_inc_size(bg_list); + debug("Blocks have finished being created."); return error_code; } @@ -1520,10 +1552,9 @@ static int _update_bg_record_state(List bg_destroy_list) if ((rc = rm_get_partitions_info(block_state, &block_list)) != STATUS_OK) { slurm_mutex_unlock(&api_file_mutex); - error("rm_get_partitions_info(): %s", bg_err_str(rc)); + error("1 rm_get_partitions_info(): %s", bg_err_str(rc)); return SLURM_ERROR; } - slurm_mutex_unlock(&api_file_mutex); if ((rc = rm_get_data(block_list, RM_PartListSize, &num_blocks)) != STATUS_OK) { @@ -1531,7 +1562,7 @@ static int _update_bg_record_state(List bg_destroy_list) func_rc = SLURM_ERROR; num_blocks = 0; } - + for (j=0; j<num_blocks; j++) { if (j) { if ((rc = rm_get_data(block_list, @@ -1568,6 +1599,7 @@ static int _update_bg_record_state(List bg_destroy_list) continue; } + slurm_mutex_lock(&block_state_mutex); itr = list_iterator_create(bg_destroy_list); while ((bg_record = (bg_record_t*) list_next(itr))) { if(!bg_record->bg_block_id) @@ -1576,7 +1608,6 @@ static int _update_bg_record_state(List bg_destroy_list) continue; } - slurm_mutex_lock(&block_state_mutex); if ((rc = rm_get_data(block_ptr, RM_PartitionState, &state)) @@ -1589,16 +1620,19 @@ static int _update_bg_record_state(List bg_destroy_list) name, bg_record->state, state); bg_record->state = state; } - slurm_mutex_unlock(&block_state_mutex); break; } list_iterator_destroy(itr); + slurm_mutex_unlock(&block_state_mutex); + free(name); } if ((rc = rm_free_partition_list(block_list)) != STATUS_OK) { error("rm_free_partition_list(): %s", bg_err_str(rc)); } + slurm_mutex_unlock(&api_file_mutex); + return func_rc; } #endif /* HAVE_BG_FILES */ @@ -1729,7 +1763,7 @@ static int _validate_config_nodes(void) "no bg_curr_block_list"); } if (!bg_record->bg_block_id) { - _format_node_name(bg_record, tmp_char); + format_node_name(bg_record, tmp_char); info("Block found in bluegene.conf to be " "created: Nodes:%s", @@ -1737,7 +1771,7 @@ static int _validate_config_nodes(void) rc = SLURM_ERROR; } else { list_append(bg_found_block_list, bg_record); - _format_node_name(bg_record, tmp_char); + format_node_name(bg_record, tmp_char); info("Found existing BG BlockID:%s " "Nodes:%s Conn:%s", @@ -1748,8 +1782,7 @@ static int _validate_config_nodes(void) } list_iterator_destroy(itr_conf); if(bg_curr_block_list) { - itr_curr = list_iterator_create( - bg_curr_block_list); + itr_curr = list_iterator_create(bg_curr_block_list); while ((init_bg_record = (bg_record_t*) list_next(itr_curr)) != NULL) { @@ -1777,7 +1810,7 @@ static int _validate_config_nodes(void) bg_record->full_block = 1; debug("full system %s", bg_record->bg_block_id); - _format_node_name(bg_record, tmp_char); + format_node_name(bg_record, tmp_char); info("Found existing BG " "BlockID:%s " "Nodes:%s Conn:%s", @@ -1956,7 +1989,7 @@ static int _delete_old_blocks(void) } list_destroy(bg_destroy_list); - + info("I am done deleting"); #endif return SLURM_SUCCESS; } @@ -2000,17 +2033,16 @@ static void _strip_13_10(char *line) static int _split_block(bg_record_t *bg_record, int procs, int *block_inx) { bg_record_t *found_record = NULL; - ba_node_t *ba_node = NULL; bool full_bp = false; int small_count = 0; int small_size = 0; int num_segment = 0, num_quarter = 0; int i; int node_cnt = 0; - int quarter = 0; - int segment = 0; + uint16_t quarter = 0; + uint16_t segment = 0; - if(bg_record->quarter == NO_VAL) + if(bg_record->quarter == (uint16_t) NO_VAL) full_bp = true; if(procs == (procs_per_node/16)) { @@ -2027,7 +2059,6 @@ static int _split_block(bg_record_t *bg_record, int procs, int *block_inx) num_segment, bg_record->node_cnt); small_count = num_segment+num_quarter; - ba_node = list_pop(bg_record->bg_block_list); /* break base partition up into 16 parts */ small_size = bluegene_bp_node_cnt/bluegene_segment_node_cnt; node_cnt = 0; @@ -2043,7 +2074,7 @@ static int _split_block(bg_record_t *bg_record, int procs, int *block_inx) } if(small_size == 4) - segment = NO_VAL; + segment = (uint16_t)NO_VAL; else segment = i%4; found_record = _create_small_record(bg_record, @@ -2069,20 +2100,19 @@ static int _split_block(bg_record_t *bg_record, int procs, int *block_inx) quarter++; } } - - + return SLURM_SUCCESS; } static bg_record_t *_create_small_record(bg_record_t *bg_record, - int quarter, int segment) + uint16_t quarter, uint16_t segment) { bg_record_t *found_record = NULL; int small_size = 4; found_record = (bg_record_t*) xmalloc(sizeof(bg_record_t)); - found_record->job_running = NO_VAL; + found_record->job_running = -1; found_record->user_name = xstrdup(bg_record->user_name); found_record->user_uid = bg_record->user_uid; found_record->bg_block_list = list_create(NULL); @@ -2094,7 +2124,7 @@ static bg_record_t *_create_small_record(bg_record_t *bg_record, found_record->conn_type = SELECT_SMALL; found_record->node_use = SELECT_COPROCESSOR_MODE; - if(segment != NO_VAL) + if(segment != (uint16_t) NO_VAL) small_size = 16; found_record->cpus_per_bp = procs_per_node/small_size; found_record->node_cnt = bluegene_bp_node_cnt/small_size; @@ -2116,8 +2146,8 @@ static int _add_bg_record(List records, char *nodes, int i, len; int small_size = 0; int small_count = 0; - int quarter = 0; - int segment = 0; + uint16_t quarter = 0; + uint16_t segment = 0; int node_cnt = 0; bg_record = (bg_record_t*) xmalloc(sizeof(bg_record_t)); @@ -2133,8 +2163,8 @@ static int _add_bg_record(List records, char *nodes, } bg_record->bg_block_list = list_create(NULL); bg_record->hostlist = hostlist_create(NULL); - bg_record->quarter = NO_VAL; - bg_record->segment = NO_VAL; + bg_record->quarter = (uint16_t)NO_VAL; + bg_record->segment = (uint16_t)NO_VAL; /* bg_record->boot_state = 0; Implicit */ /* bg_record->state = 0; Implicit */ debug2("asking for %s %d %d",nodes, num_quarter, num_segment); @@ -2182,8 +2212,8 @@ static int _add_bg_record(List records, char *nodes, bg_record->conn_type = conn_type; bg_record->cpus_per_bp = procs_per_node; bg_record->node_cnt = bluegene_bp_node_cnt * bg_record->bp_count; - bg_record->job_running = NO_VAL; - + bg_record->job_running = -1; + if(bg_record->conn_type != SELECT_SMALL) list_append(records, bg_record); else { @@ -2225,7 +2255,7 @@ static int _add_bg_record(List records, char *nodes, if(small_size == 4) - segment = NO_VAL; + segment = (uint16_t)NO_VAL; else segment = i%4; found_record = _create_small_record(bg_record, diff --git a/src/plugins/select/bluegene/plugin/bluegene.h b/src/plugins/select/bluegene/plugin/bluegene.h index 13050214390..080e8ba73d5 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.h +++ b/src/plugins/select/bluegene/plugin/bluegene.h @@ -87,10 +87,10 @@ typedef struct bg_record { int job_running; /* job id if there is a job running on the block */ int cpus_per_bp; /* count of cpus per base part */ - int node_cnt; /* count of nodes per block */ - int quarter; /* used for small blocks + uint32_t node_cnt; /* count of nodes per block */ + uint16_t quarter; /* used for small blocks determine quarter of BP */ - int segment; /* used for small blocks + uint16_t segment; /* used for small blocks determine segment of quarter */ } bg_record_t; diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c index 75e746efa4d..991fd61a313 100644 --- a/src/sinfo/sinfo.c +++ b/src/sinfo/sinfo.c @@ -230,7 +230,7 @@ _query_server(partition_info_msg_t ** part_pptr, } else error_code = slurm_load_partitions((time_t) NULL, &new_part_ptr, - show_flags); + show_flags); if (error_code) { slurm_perror("slurm_load_part"); return error_code; diff --git a/src/smap/smap.c b/src/smap/smap.c index 9245b4ae270..22fb693d961 100644 --- a/src/smap/smap.c +++ b/src/smap/smap.c @@ -76,7 +76,6 @@ int main(int argc, char *argv[]) log_init(xbasename(argv[0]), opts, SYSLOG_FACILITY_DAEMON, NULL); parse_command_line(argc, argv); - while (slurm_load_node((time_t) NULL, &new_node_ptr, SHOW_ALL)) { error_code = slurm_get_errno(); printf("slurm_load_node: %s\n", slurm_strerror(error_code)); @@ -89,7 +88,9 @@ int main(int argc, char *argv[]) sleep(10); /* keep trying to reconnect */ } + printf("starting\n"); ba_init(new_node_ptr); + printf("done hey\n"); if(params.partition) { -- GitLab