diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index f8bbe2c8569dccc9342151a04af733d5d05c99da..d98c5623373b35fa9c0267a7606e7c0ba63ff05b 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -1269,6 +1269,7 @@ extern int load_block_wiring(char *bg_block_id) int cnt = 0; int switch_cnt = 0; rm_switch_t *curr_switch = NULL; + rm_BP_t *curr_bp = NULL; char *switchid = NULL; rm_connection_t curr_conn; int dim; @@ -1293,6 +1294,32 @@ extern int load_block_wiring(char *bg_block_id) bg_err_str(rc)); return SLURM_ERROR; } + if(!switch_cnt) { + debug("no switch_cnt"); + if ((rc = rm_get_data(block_ptr, + RM_PartitionFirstBP, + &curr_bp)) + != STATUS_OK) { + error("rm_get_data: " + "RM_PartitionFirstBP: %s", + bg_err_str(rc)); + return SLURM_ERROR; + } + if ((rc = rm_get_data(curr_bp, RM_BPID, &switchid)) + != STATUS_OK) { + error("rm_get_data: RM_SwitchBPID: %s", + bg_err_str(rc)); + return SLURM_ERROR; + } + + geo = find_bp_loc(switchid); + if(!geo) { + error("find_bp_loc: bpid %s not known", switchid); + return SLURM_ERROR; + } + ba_system_ptr->grid[geo[X]][geo[Y]][geo[Z]].used = true; + return SLURM_SUCCESS; + } for (i=0; i<switch_cnt; i++) { if(i) { if ((rc = rm_get_data(block_ptr, @@ -1333,6 +1360,7 @@ extern int load_block_wiring(char *bg_block_id) error("find_bp_loc: bpid %s not known", switchid); return SLURM_ERROR; } + if ((rc = rm_get_data(curr_switch, RM_SwitchConnNum, &cnt)) != STATUS_OK) { error("rm_get_data: RM_SwitchBPID: %s", @@ -1342,7 +1370,8 @@ extern int load_block_wiring(char *bg_block_id) debug("switch id = %s dim %d conns = %d", switchid, dim, cnt); ba_switch = &ba_system_ptr-> - grid[geo[X]][geo[Y]][geo[Z]].axis_switch[dim]; + grid[geo[X]][geo[Y]][geo[Z]].axis_switch[dim]; + ba_system_ptr->grid[geo[X]][geo[Y]][geo[Z]].used = true; for (j=0; j<cnt; j++) { if(j) { if ((rc = rm_get_data(curr_switch, @@ -1398,19 +1427,7 @@ extern int load_block_wiring(char *bg_block_id) } debug("connection going from %d -> %d", curr_conn.p1, curr_conn.p2); - if(curr_conn.p1 == 1) { - if(ba_system_ptr-> - grid[geo[X]][geo[Y]][geo[Z]].used) { - error("%d%d%d is already in use", - geo[X], - geo[Y], - geo[Z]); - return SLURM_ERROR; - } - ba_system_ptr-> - grid[geo[X]][geo[Y]][geo[Z]].used = 1; - } - + if(ba_switch->int_wire[curr_conn.p1].used) { error("%d%d%d dim %d port %d " "is already in use", @@ -1424,7 +1441,7 @@ extern int load_block_wiring(char *bg_block_id) ba_switch->int_wire[curr_conn.p1].used = 1; ba_switch->int_wire[curr_conn.p1].port_tar = curr_conn.p2; - + if(ba_switch->int_wire[curr_conn.p2].used) { error("%d%d%d dim %d port %d " "is already in use", @@ -1438,7 +1455,7 @@ extern int load_block_wiring(char *bg_block_id) ba_switch->int_wire[curr_conn.p2].used = 1; ba_switch->int_wire[curr_conn.p2].port_tar = curr_conn.p1; - } + } } return SLURM_SUCCESS; @@ -2249,6 +2266,7 @@ static int _find_match(ba_request_t *ba_request, List results) ba_node_t *ba_node = NULL; char *name=NULL; int startx = (start[X]-1); + if(startx == -1) startx = DIM_SIZE[X]-1; if(ba_request->start_req) { @@ -2311,15 +2329,23 @@ start_again: ; if (!_node_used(ba_node, ba_request->geometry)) { + info("trying this node %d%d%d %d%d%d %d", + start[X], start[Y], start[Z], + ba_request->geometry[X], + ba_request->geometry[Y], + ba_request->geometry[Z], + ba_request->conn_type); name = set_bg_block(results, start, ba_request->geometry, ba_request->conn_type); if(name) { + info("yes"); ba_request->save_name = xstrdup(name); xfree(name); return 1; } + info("nope"); if(ba_request->start_req) goto requested_end; //exit(0); @@ -2328,6 +2354,7 @@ start_again: list_destroy(results); results = list_create(NULL); } + info("got here"); #ifdef HAVE_BG if((DIM_SIZE[Z]-start[Z]-1) @@ -2344,6 +2371,8 @@ start_again: >= ba_request->geometry[X]) start[X]++; else { + if(ba_request->size == 1) + goto requested_end; if(!_check_for_options(ba_request)) return 0; else { @@ -2359,7 +2388,7 @@ start_again: #endif } requested_end: - debug("can't allocate"); + info("can't allocate"); return 0; } diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 9af62a0d856b33dc1537caaefb977dc8115ac07e..74521bd0d6d8949c4e8bf862769bbe5305f56dcc 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -131,7 +131,6 @@ static int _find_best_block_match(struct job_record* job_ptr, *found_bg_record = NULL; try_again: - debug("got here"); slurm_mutex_lock(&block_state_mutex); debug("number of blocks to check: %d state %d", list_count(bg_list), @@ -326,7 +325,7 @@ try_again: if(!found && test_only && bluegene_layout_mode == LAYOUT_DYNAMIC) { slurm_mutex_unlock(&block_state_mutex); - + for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) request.start[i] = 0; @@ -376,8 +375,13 @@ try_again: slurm_mutex_unlock(&block_state_mutex); lists_of_lists = list_create(NULL); list_append(lists_of_lists, bg_list); - list_append(lists_of_lists, bg_booted_block_list); - list_append(lists_of_lists, bg_job_block_list); + if(list_count(bg_list) + != list_count(bg_booted_block_list)) + list_append(lists_of_lists, bg_booted_block_list); + if(list_count(bg_booted_block_list) + != list_count(bg_job_block_list)) + list_append(lists_of_lists, bg_job_block_list); + itr = list_iterator_create(lists_of_lists); while ((temp_list = (List)list_next(itr)) != NULL) { created++; @@ -454,6 +458,10 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, uint16_t geo[BA_SYSTEM_DIMENSIONS]; uint16_t tmp16 = (uint16_t)NO_VAL; + if(!test_only && (list_count(bg_list) > 0) + && (list_count(bg_list) == list_count(bg_job_block_list))) + return SLURM_ERROR; + select_g_sprint_jobinfo(job_ptr->select_jobinfo, buf, sizeof(buf), SELECT_PRINT_MIXED); @@ -475,7 +483,7 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, "unassigned"); if(job_ptr->num_procs < bluegene_bp_node_cnt) { i = procs_per_node/job_ptr->num_procs; - info("divide by %d",i); + debug2("divide by %d", i); } else i = 1; min_nodes *= bluegene_bp_node_cnt/i; @@ -490,6 +498,7 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, &geo); } else { + slurm_mutex_lock(&block_state_mutex); /* set the block id and info about block */ select_g_set_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_BLOCK_ID, @@ -510,6 +519,7 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, select_g_set_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_CONN_TYPE, &tmp16); + slurm_mutex_unlock(&block_state_mutex); } if(test_only) { select_g_set_jobinfo(job_ptr->select_jobinfo, diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index a2aeaacf609f03d39c8780c372d29f08d0b28aea..1e7c774731d79aa98154b5b4125224d30bb144df 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -103,9 +103,9 @@ static int _remove_job(db_job_id_t job_id) slurm_mutex_lock(&api_file_mutex); /* Find the job */ if ((rc = rm_get_job(job_id, &job_rec)) != STATUS_OK) { + slurm_mutex_unlock(&api_file_mutex); if (rc == JOB_NOT_FOUND) { debug("job %d removed from MMCS", job_id); - slurm_mutex_unlock(&api_file_mutex); return STATUS_OK; } @@ -283,7 +283,7 @@ static void _start_agent(bg_update_t *bg_update_ptr) slurm_mutex_unlock(&block_state_mutex); /* wait for all necessary blocks to be freed */ - while(num_block_to_free != num_block_freed) { + while(num_block_to_free > num_block_freed) { sleep(1); debug("got %d of %d freed", num_block_freed, @@ -686,9 +686,9 @@ extern int start_job(struct job_record *job_ptr) bg_record = find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id); if (bg_record) { + slurm_mutex_lock(&block_state_mutex); job_ptr->num_procs = (bg_record->cpus_per_bp * bg_record->bp_count); - slurm_mutex_lock(&block_state_mutex); bg_record->job_running = bg_update_ptr->job_id; if(!block_exist_in_list(bg_job_block_list, bg_record)) list_push(bg_job_block_list, bg_record); diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 30f449449a2170396b558c57c3fa3d7bd66b3774..5f1b2bc9b3b405f412befb293dadb5261394a726 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -645,8 +645,7 @@ extern int remove_all_users(char *bg_block_id, char *user_name) RM_PartitionNextUser, &user)) != STATUS_OK) { - error("rm_get_partition(%s): %s", - bg_block_id, + error("rm_get_data(RM_PartitionNextUser): %s", bg_err_str(rc)); returnc = REMOVE_USER_ERR; break; @@ -656,8 +655,7 @@ extern int remove_all_users(char *bg_block_id, char *user_name) RM_PartitionFirstUser, &user)) != STATUS_OK) { - error("rm_get_data(%s): %s", - bg_block_id, + error("rm_get_data(RM_PartitionFirstUser): %s", bg_err_str(rc)); returnc = REMOVE_USER_ERR; break; @@ -1104,15 +1102,15 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) || (bg_record->quarter == 0 && (bg_record->nodecard == (uint16_t) NO_VAL || bg_record->nodecard == 0)))) { + + for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) + request->start[i] = bg_record->start[i]; debug2("allocating %d%d%d %d", bg_record->nodes, request->start[X], request->start[Y], request->start[Z], request->size); - - for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) - request->start[i] = bg_record->start[i]; request->start_req = 1; rc = SLURM_SUCCESS; if (!allocate_block(request, results)){ @@ -1341,7 +1339,7 @@ extern int bg_free_block(bg_record_t *bg_record) && bg_record->state != RM_PARTITION_FREE && bg_record->state != RM_PARTITION_DEALLOCATING) { #ifdef HAVE_BG_FILES - debug2("pm_destroy %s",bg_record->bg_block_id); + debug("pm_destroy %s",bg_record->bg_block_id); slurm_mutex_lock(&api_file_mutex); rc = pm_destroy_partition(bg_record->bg_block_id); @@ -1352,17 +1350,19 @@ extern int bg_free_block(bg_record_t *bg_record) bg_record->bg_block_id); break; } else if(rc == INCOMPATIBLE_STATE) { - debug2("pm_destroy_partition(%s): %s " - "State = %d", - bg_record->bg_block_id, - bg_err_str(rc), - bg_record->state); - continue; + debug("pm_destroy_partition(%s): %s " + "State = %d", + bg_record->bg_block_id, + bg_err_str(rc), + bg_record->state); + break; + } else { + error("pm_destroy_partition(%s): %s " + "State = %d", + bg_record->bg_block_id, + bg_err_str(rc), + bg_record->state); } - error("pm_destroy_partition(%s): %s " - "State = %d", - bg_record->bg_block_id, - bg_err_str(rc), bg_record->state); } #else bg_record->state = RM_PARTITION_FREE; @@ -1472,9 +1472,12 @@ extern void *mult_destroy_block(void *args) bg_record->bg_block_id); term_jobs_on_block(bg_record->bg_block_id); - debug2("destroying %s", (char *)bg_record->bg_block_id); - if(bg_free_block(bg_record) == SLURM_ERROR) + debug("destroying %s", (char *)bg_record->bg_block_id); + if(bg_free_block(bg_record) == SLURM_ERROR) { + debug("there was an error"); goto already_here; + } + debug("done destroying"); remove_from_bg_list(bg_list, bg_record); #ifdef HAVE_BG_FILES @@ -1493,7 +1496,7 @@ extern void *mult_destroy_block(void *args) bg_err_str(rc)); } } else - debug("done\n"); + debug("done"); slurm_mutex_unlock(&api_file_mutex); #endif slurm_mutex_lock(&block_state_mutex);