diff --git a/NEWS b/NEWS index 0964c3506c4f09c9cb9766923a4837861b73c963..22e35d6ef8bdb982eb0ec92369f134f7e1d2fbfd 100644 --- a/NEWS +++ b/NEWS @@ -273,6 +273,14 @@ documents those changes that are of interest to users and admins. -- In accounting_storage/filetxt and accounting_storage/pgsql fix possible invalid memory reference when a job lacks a name. -- Give srun command an exit code of 1 if the prolog fails. + -- BLUEGENE - allows for checking nodecard states in the system instead + of midplane state so as to not down an entire midplane if you don't + have to. + -- BLUEGENE - fix creation of MESH blocks + -- BLUEGENE - on job cancellation we call jm_cancel_job and then wait 5.5 + minutes for the job to finish before sending SIGKILL to the job, then wait + 30 seconds and fail the cancel if the job is still there. Previously we + would send a SIGKILL right at the beginning. * Changes in SLURM 1.3.14 ========================= diff --git a/doc/man/man1/sreport.1 b/doc/man/man1/sreport.1 index 89d5b50d79fd2533b0c71b33dab130476ff428e0..3215a858ed20e19fa86a15cd4b76a72a346b80ce 100644 --- a/doc/man/man1/sreport.1 +++ b/doc/man/man1/sreport.1 @@ -263,7 +263,13 @@ List of accounts to use for the report Default is all. The SizesByAccount report only displays 1 hierarchical level. If accounts are specified the next layer of accounts under those specified will be displayed, not the accounts specified. In the SizesByAccount reports the default -for accounts is root. +for accounts is root. This explanation does not apply when run with +the FlatView option. +.TP +.B FlatView +When used with the SizesByAccount report, accounts are not grouped in a +hierarchical level; instead each account where jobs ran is printed on a +separate line without any hierarchy. .TP .B GID=<OPT> List of group ids to include in report. Default is all. 
diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index 7a99a1854238b408aa2381a028c3c1c4e1e0b5ad..379a1e38ad415bbca517b150bd536cda8df3fccb 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -2895,7 +2895,7 @@ static int _append_geo(int *geometry, List geos, int rotate) /* * Fill in the paths and extra midplanes we need for the block. - * Basically copy the x path sent in with the start_list in each Y anx + * Basically copy the x path sent in with the start_list in each Y and * Z dimension filling in every midplane for the block and then * completing the Y and Z wiring, tying the whole block together. * @@ -2930,14 +2930,12 @@ static int _fill_in_coords(List results, List start_list, curr_switch = &check_node->axis_switch[X]; for(y=0; y<geometry[Y]; y++) { - if((check_node->coord[Y]+y) - >= DIM_SIZE[Y]) { + if((check_node->coord[Y]+y) >= DIM_SIZE[Y]) { rc = 0; goto failed; } for(z=0; z<geometry[Z]; z++) { - if((check_node->coord[Z]+z) - >= DIM_SIZE[Z]) { + if((check_node->coord[Z]+z) >= DIM_SIZE[Z]) { rc = 0; goto failed; } @@ -3305,7 +3303,7 @@ static int _find_yz_path(ba_node_t *ba_node, int *first, geometry[i2], i2, count); return 0; } - } else if(geometry[i2] == 1) { + } else if((geometry[i2] == 1) && (conn_type == SELECT_TORUS)) { /* FIX ME: This is put here because we got into a state where the Y dim was not being processed correctly. 
This will set up the @@ -4248,10 +4246,16 @@ static int _find_x_path(List results, ba_node_t *ba_node, /* we don't need to go any further */ if(x_size == 1) { - curr_switch->int_wire[source_port].used = 1; - curr_switch->int_wire[source_port].port_tar = target_port; - curr_switch->int_wire[target_port].used = 1; - curr_switch->int_wire[target_port].port_tar = source_port; + /* Only set this if Torus since mesh doesn't have any + * connections in this path */ + if(conn_type == SELECT_TORUS) { + curr_switch->int_wire[source_port].used = 1; + curr_switch->int_wire[source_port].port_tar = + target_port; + curr_switch->int_wire[target_port].used = 1; + curr_switch->int_wire[target_port].port_tar = + source_port; + } return 1; } diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index 9f09a29f424f8c7fc09d91c9e25748fc4f78f89c..e5f3bba9972c43c5f3988f0c941dc8d971b2b099 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -212,8 +212,8 @@ extern void pack_block(bg_record_t *bg_record, Buf buffer) pack16((uint16_t)bg_record->conn_type, buffer); #ifdef HAVE_BGL pack16((uint16_t)bg_record->node_use, buffer); - pack16((uint16_t)bg_record->quarter, buffer); - pack16((uint16_t)bg_record->nodecard, buffer); + pack16((uint16_t)0, buffer); + pack16((uint16_t)0, buffer); #endif pack32((uint32_t)bg_record->node_cnt, buffer); pack_bit_fmt(bg_record->bitmap, buffer); diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 483ccc8a2a0555d3ad777d4f1d20489737b20041..784d1604dde4eb5df25b943f2d0d54f1592cab9d 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -137,10 +137,16 @@ static void _rotate_geo(uint16_t *req_geometry, int rot_cnt) */ static int _bg_record_sort_aval_inc(bg_record_t* rec_a, bg_record_t* rec_b) { - 
if(rec_a->job_ptr && !rec_b->job_ptr) + if((rec_a->job_running == BLOCK_ERROR_STATE) + && (rec_b->job_running != BLOCK_ERROR_STATE)) + return 1; + else if((rec_a->job_running != BLOCK_ERROR_STATE) + && (rec_b->job_running == BLOCK_ERROR_STATE)) return -1; else if(!rec_a->job_ptr && rec_b->job_ptr) return 1; + else if(rec_a->job_ptr && !rec_b->job_ptr) + return -1; else if(rec_a->job_ptr && rec_b->job_ptr) { if(rec_a->job_ptr->start_time > rec_b->job_ptr->start_time) return 1; @@ -159,10 +165,16 @@ static int _bg_record_sort_aval_inc(bg_record_t* rec_a, bg_record_t* rec_b) */ static int _bg_record_sort_aval_dec(bg_record_t* rec_a, bg_record_t* rec_b) { - if(rec_a->job_ptr && !rec_b->job_ptr) + if((rec_a->job_running == BLOCK_ERROR_STATE) + && (rec_b->job_running != BLOCK_ERROR_STATE)) + return -1; + else if((rec_a->job_running != BLOCK_ERROR_STATE) + && (rec_b->job_running == BLOCK_ERROR_STATE)) return 1; else if(!rec_a->job_ptr && rec_b->job_ptr) return -1; + else if(rec_a->job_ptr && !rec_b->job_ptr) + return 1; else if(rec_a->job_ptr && rec_b->job_ptr) { if(rec_a->job_ptr->start_time > rec_b->job_ptr->start_time) return -1; @@ -1022,10 +1034,8 @@ static int _find_best_block_match(List block_list, "block %s in an error state " "because of bad bps.", bg_record->bg_block_id); - bg_record->job_running = - BLOCK_ERROR_STATE; - bg_record->state = RM_PARTITION_ERROR; - trigger_block_error(); + put_block_in_error_state( + bg_record, BLOCK_ERROR_STATE); continue; } } @@ -1094,14 +1104,29 @@ static int _find_best_block_match(List block_list, request.geometry[i] = req_geometry[i]; bg_record = list_pop(job_list); - if(bg_record) - debug2("taking off %d(%s) started " - "at %d ends at %d", - bg_record->job_running, - bg_record->bg_block_id, - bg_record->job_ptr->start_time, - bg_record->job_ptr->end_time); - else + if(bg_record) { + if(bg_record->job_ptr) + debug2("taking off %d(%s) " + "started at %d " + "ends at %d", + bg_record->job_running, + bg_record->bg_block_id, + 
bg_record->job_ptr-> + start_time, + bg_record->job_ptr-> + end_time); + else if(bg_record->job_running + == BLOCK_ERROR_STATE) + /* job_ptr is NULL on this + * branch, so it must not + * be dereferenced; only + * the block id is logged. + */ + debug2("taking off (%s) " + "which is in an error " + "state", + bg_record->bg_block_id); + } else /* This means we didn't have any jobs to take off anymore so we are making @@ -1407,7 +1432,8 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, else starttime = bg_record->job_ptr->end_time; - } + } else if(bg_record->job_running == BLOCK_ERROR_STATE) + starttime = INFINITE; job_ptr->start_time = starttime; diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index 4f9a78017023740f6f8eb510528ac489088286e7..b24e5b92cc15f5c44c11e8da70db48a758f47f00 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -123,7 +123,6 @@ static int _remove_job(db_job_id_t job_id) for (i=0; i<MAX_POLL_RETRIES; i++) { if (i > 0) sleep(POLL_INTERVAL); - /* Find the job */ if ((rc = bridge_get_job(job_id, &job_rec)) != STATUS_OK) { @@ -159,19 +158,19 @@ static int _remove_job(db_job_id_t job_id) /* check the state and process accordingly */ if(job_state == RM_JOB_TERMINATED) return STATUS_OK; - else if(job_state == RM_JOB_DYING) + else if(job_state == RM_JOB_DYING) { + /* send one SIGKILL when only 5 tries remain */ + if(i == (MAX_POLL_RETRIES-5)) + (void) bridge_signal_job(job_id, SIGKILL); continue; - else if(job_state == RM_JOB_ERROR) { + } else if(job_state == RM_JOB_ERROR) { error("job %d is in a error state.", job_id); //free_bg_block(); return STATUS_OK; } - (void) bridge_signal_job(job_id, SIGKILL); rc = bridge_cancel_job(job_id); - /* it doesn't appear that this does anything. 
*/ - // rc = bridge_remove_job(job_id); if (rc != STATUS_OK) { if (rc == JOB_NOT_FOUND) { @@ -187,8 +186,6 @@ static int _remove_job(db_job_id_t job_id) } } /* try once more... */ - /* it doesn't appear that this does anything. */ - // (void) bridge_remove_job(job_id); error("Failed to remove job %d from MMCS", job_id); return INTERNAL_ERROR; } diff --git a/src/plugins/select/bluegene/plugin/bg_record_functions.c b/src/plugins/select/bluegene/plugin/bg_record_functions.c index e4c572444554406c5008868298e28f5d8d5ab64c..6448ff07aba76a9173b68dfc5b466fd2f062d4ad 100644 --- a/src/plugins/select/bluegene/plugin/bg_record_functions.c +++ b/src/plugins/select/bluegene/plugin/bg_record_functions.c @@ -126,13 +126,13 @@ extern int block_exist_in_list(List my_list, bg_record_t *bg_record) && bit_equal(bg_record->ionode_bitmap, found_record->ionode_bitmap)) { if(bg_record->ionodes) - debug3("This block %s[%s] " + debug("This block %s[%s] " "is already in the list %s", bg_record->nodes, bg_record->ionodes, found_record->bg_block_id); else - debug3("This block %s " + debug("This block %s " "is already in the list %s", bg_record->nodes, found_record->bg_block_id); @@ -449,10 +449,6 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record) sec_record->job_ptr = fir_record->job_ptr; sec_record->cpu_cnt = fir_record->cpu_cnt; sec_record->node_cnt = fir_record->node_cnt; -#ifdef HAVE_BGL - sec_record->quarter = fir_record->quarter; - sec_record->nodecard = fir_record->nodecard; -#endif } /* @@ -478,17 +474,7 @@ extern int bg_record_cmpf_inc(bg_record_t* rec_a, bg_record_t* rec_b) else if (size_a > 0) return 1; } -#ifdef HAVE_BGL - if (rec_a->quarter < rec_b->quarter) - return -1; - else if (rec_a->quarter > rec_b->quarter) - return 1; - if(rec_a->nodecard < rec_b->nodecard) - return -1; - else if(rec_a->nodecard > rec_b->nodecard) - return 1; -#else if(!rec_a->ionode_bitmap || !rec_b->ionode_bitmap) return 0; @@ -496,7 +482,7 @@ extern int 
bg_record_cmpf_inc(bg_record_t* rec_a, bg_record_t* rec_b) return -1; else return 1; -#endif + return 0; } @@ -505,29 +491,25 @@ extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id) ListIterator itr; bg_record_t *bg_record = NULL; + xassert(my_list); + if(!bg_block_id) return NULL; - if(my_list) { - slurm_mutex_lock(&block_state_mutex); - itr = list_iterator_create(my_list); - while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { - if(bg_record->bg_block_id) - if (!strcmp(bg_record->bg_block_id, - bg_block_id)) - break; - } - list_iterator_destroy(itr); - slurm_mutex_unlock(&block_state_mutex); - if(bg_record) - return bg_record; - else - return NULL; - } else { - error("find_bg_record_in_list: no list"); - return NULL; + slurm_mutex_lock(&block_state_mutex); + itr = list_iterator_create(my_list); + while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { + if(bg_record->bg_block_id) + if (!strcmp(bg_record->bg_block_id, + bg_block_id)) + break; } - + list_iterator_destroy(itr); + slurm_mutex_unlock(&block_state_mutex); + if(bg_record) + return bg_record; + else + return NULL; } /* All changes to the bg_list target_name must @@ -659,59 +641,10 @@ end_it: sleep(1); } - slurm_mutex_lock(&block_state_mutex); - error("Setting Block %s to ERROR state.", bg_record->bg_block_id); - bg_record->job_running = BLOCK_ERROR_STATE; - bg_record->state = RM_PARTITION_ERROR; - remove_from_bg_list(bg_booted_block_list, bg_record); - slurm_mutex_unlock(&block_state_mutex); - trigger_block_error(); + put_block_in_error_state(bg_record, BLOCK_ERROR_STATE); return; } -#ifdef HAVE_BGL - -extern int set_ionodes(bg_record_t *bg_record) -{ - int i = 0; - int start_bit = 0; - int size = 0; - char bitstring[BITSIZE]; - - if(!bg_record) - return SLURM_ERROR; - /* set the bitmap blank here if it is a full node we don't - want anything set we also don't want the bg_record->ionodes set. 
- */ - bg_record->ionode_bitmap = bit_alloc(bluegene_numpsets); - if(bg_record->quarter == (uint16_t)NO_VAL) { - return SLURM_SUCCESS; - } - - start_bit = bluegene_quarter_ionode_cnt*bg_record->quarter; - - if(bg_record->nodecard != (uint16_t)NO_VAL - && bluegene_nodecard_ionode_cnt) { - start_bit += bluegene_nodecard_ionode_cnt*bg_record->nodecard; - size = bluegene_nodecard_ionode_cnt; - } else - size = bluegene_quarter_ionode_cnt; - size += start_bit; - - if(size == start_bit) { - error("start bit is the same as the end bit %d", size); - return SLURM_ERROR; - } - for(i=start_bit; i<size; i++) - bit_set(bg_record->ionode_bitmap, i); - - bit_fmt(bitstring, BITSIZE, bg_record->ionode_bitmap); - bg_record->ionodes = xstrdup(bitstring); - - return SLURM_SUCCESS; -} -#else - extern int set_ionodes(bg_record_t *bg_record, int io_start, int io_nodes) { char bitstring[BITSIZE]; @@ -727,9 +660,8 @@ extern int set_ionodes(bg_record_t *bg_record, int io_start, int io_nodes) return SLURM_SUCCESS; } -#endif - -extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq) +extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq, + bool no_check, bitoff_t io_start) { bg_record_t *bg_record = NULL; ba_node_t *ba_node = NULL; @@ -737,23 +669,15 @@ extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq) uid_t pw_uid; int i, len; int small_count = 0; -#ifdef HAVE_BGL - int node_cnt = 0; - uint16_t quarter = 0; - uint16_t nodecard = 0; - int small_size = 0; - bg_record_t *found_record = NULL; -#endif + if(!records) { fatal("add_bg_record: no records list given"); } bg_record = (bg_record_t*) xmalloc(sizeof(bg_record_t)); - bg_record->user_name = - xstrdup(bg_slurm_user_name); - bg_record->target_name = - xstrdup(bg_slurm_user_name); + bg_record->user_name = xstrdup(bg_slurm_user_name); + bg_record->target_name = xstrdup(bg_slurm_user_name); pw_uid = uid_from_string(bg_record->user_name); if(pw_uid == (uid_t) -1) { @@ -772,10 
+696,8 @@ extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq) /* bg_record->boot_state = 0; Implicit */ /* bg_record->state = 0; Implicit */ #ifdef HAVE_BGL - bg_record->quarter = (uint16_t)NO_VAL; - bg_record->nodecard = (uint16_t)NO_VAL; debug2("asking for %s %d %d %s", - blockreq->block, blockreq->small128, blockreq->small32, + blockreq->block, blockreq->small32, blockreq->small128, convert_conn_type(blockreq->conn_type)); #else debug2("asking for %s %d %d %d %d %d %s", @@ -852,79 +774,8 @@ extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq) } } else { debug("adding a small block"); -#ifdef HAVE_BGL // remove this clause when other works. Only here to - // perserve old code - - /* if the ionode cnt for small32 is 0 then don't - allow a nodecard allocation - */ - if(!bluegene_nodecard_ionode_cnt) { - if(blockreq->small32) - fatal("There is an error in your " - "bluegene.conf file.\n" - "Can't create a 32 node block with " - "Numpsets=%u. (Try setting it to 64)", - bluegene_numpsets); - } - - if(blockreq->small32==0 && blockreq->small128==0) { - info("No specs given for this small block, " - "I am spliting this block into 4 128CnBlocks"); - blockreq->small128=4; - } - - i = (blockreq->small32*bluegene_nodecard_node_cnt) + - (blockreq->small128*bluegene_quarter_node_cnt); - if(i != bluegene_bp_node_cnt) - fatal("There is an error in your bluegene.conf file.\n" - "I am unable to request %d nodes consisting of " - "%u 32CnBlocks and\n%u 128CnBlocks in one " - "base partition with %u nodes.", - i, bluegene_bp_node_cnt, - blockreq->small32, blockreq->small128); - small_count = blockreq->small32+blockreq->small128; - /* Automatically create 4-way split if - * conn_type == SELECT_SMALL in bluegene.conf - * Here we go through each node listed and do the same thing - * for each node. 
- */ - itr = list_iterator_create(bg_record->bg_block_list); - while ((ba_node = list_next(itr)) != NULL) { - /* break base partition up into 16 parts */ - small_size = bluegene_bp_nodecard_cnt; - node_cnt = 0; - quarter = 0; - nodecard = 0; - for(i=0; i<small_count; i++) { - if(i == blockreq->small32) { - /* break base partition - up into 4 parts */ - small_size = 4; - } - - if(small_size == 4) - nodecard = (uint16_t)NO_VAL; - else - nodecard = i%4; - found_record = create_small_record(bg_record, - quarter, - nodecard); - - /* this needs to be an append so we - keep things in the order we got - them, they will be sorted later */ - list_append(records, found_record); - node_cnt += bluegene_bp_node_cnt/small_size; - if(node_cnt == 128) { - node_cnt = 0; - quarter++; - } - } - } - list_iterator_destroy(itr); - destroy_bg_record(bg_record); -#else // remove this when testing. Only here to perserve old code the - // code below is already for bgl + if(no_check) + goto no_check; /* if the ionode cnt for small32 is 0 then don't allow a sub quarter allocation */ @@ -1001,6 +852,7 @@ extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq) + blockreq->small128 + blockreq->small256; #endif + no_check: /* Automatically create 2-way split if * conn_type == SELECT_SMALL in bluegene.conf * Here we go through each node listed and do the same thing @@ -1009,17 +861,15 @@ extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq) itr = list_iterator_create(bg_record->bg_block_list); while ((ba_node = list_next(itr)) != NULL) { handle_small_record_request(records, blockreq, - bg_record, 0); + bg_record, io_start); } list_iterator_destroy(itr); destroy_bg_record(bg_record); -#endif // remove this when done testing } return SLURM_SUCCESS; } -#ifndef HAVE_BGL extern int handle_small_record_request(List records, blockreq_t *blockreq, bg_record_t *bg_record, bitoff_t start) { @@ -1103,7 +953,6 @@ extern int handle_small_record_request(List records, 
blockreq_t *blockreq, return SLURM_SUCCESS; } -#endif extern int format_node_name(bg_record_t *bg_record, char *buf, int buf_size) { @@ -1117,127 +966,250 @@ extern int format_node_name(bg_record_t *bg_record, char *buf, int buf_size) return SLURM_SUCCESS; } -extern int down_sub_node_blocks(int *coord, bitstr_t *ionode_bitmap) +extern int down_nodecard(char *bp_name, bitoff_t io_start) { List requests = NULL; List delete_list = NULL; - List error_list = NULL; ListIterator itr = NULL; - blockreq_t blockreq; - bg_record_t *bg_record = NULL, *found_record = NULL; - char *node_name = NULL; + bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record; + bg_record_t *smallest_bg_record = NULL; struct node_record *node_ptr = NULL; int bp_bit = 0; + static int io_cnt = NO_VAL; + static int create_size = NO_VAL; + static blockreq_t blockreq; + int rc = SLURM_SUCCESS; + + xassert(bp_name); + + if(io_cnt == NO_VAL) { + io_cnt = 1; + /* Translate 1 nodecard count to ionode count */ + if((io_cnt *= bluegene_io_ratio)) + io_cnt--; + /* make sure we create something that is able to be + created */ + if(bluegene_smallest_block < bluegene_nodecard_node_cnt) + create_size = bluegene_nodecard_node_cnt; + else + create_size = bluegene_smallest_block; + } - xassert(coord); - - node_name = xstrdup_printf("%s%c%c%c", - bg_slurm_node_prefix, - alpha_num[coord[X]], - alpha_num[coord[Y]], - alpha_num[coord[Z]]); - node_ptr = find_node_record(node_name); + node_ptr = find_node_record(bp_name); if (!node_ptr) { - error ("down_sub_node_blocks: invalid node specified %s", - node_name); - xfree(node_name); + error ("down_nodecard: invalid node specified '%s'", + bp_name); return EINVAL; } bp_bit = (node_ptr - node_record_table_ptr); - - /* Here we need to add blocks that take up nodecards on this - midplane. Since Slurm only keeps track of midplanes - natively this is the only want to handle this case. 
- */ - requests = list_create(destroy_bg_record); memset(&blockreq, 0, sizeof(blockreq_t)); - - blockreq.block = node_name; + blockreq.conn_type = SELECT_SMALL; - blockreq.small32 = bluegene_bp_nodecard_cnt; + blockreq.block = bp_name; + + debug3("here setting %d of %d and %d-%d of %d", + bp_bit, node_record_count, io_start, + io_start+io_cnt, bluegene_numpsets); + + memset(&tmp_record, 0, sizeof(bg_record_t)); + tmp_record.bp_count = 1; + tmp_record.node_cnt = bluegene_nodecard_node_cnt; + tmp_record.bitmap = bit_alloc(node_record_count); + bit_set(tmp_record.bitmap, bp_bit); + + tmp_record.ionode_bitmap = bit_alloc(bluegene_numpsets); + bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt); - add_bg_record(requests, NULL, &blockreq); - slurm_mutex_lock(&block_state_mutex); itr = list_iterator_create(bg_list); + while ((bg_record = list_next(itr))) { + if(!bit_test(bg_record->bitmap, bp_bit)) + continue; - error_list = list_create(NULL); - delete_list = list_create(NULL); - while((bg_record = list_pop(requests))) { - if(bit_overlap(bg_record->ionode_bitmap, ionode_bitmap)) { - /* we don't care about this one since it - wasn't set. - */ - destroy_bg_record(bg_record); + if(!blocks_overlap(bg_record, &tmp_record)) continue; - } + + if(bg_record->job_running > NO_JOB_RUNNING) + slurm_fail_job(bg_record->job_running); + + /* mark every one of these in an error state */ + if(bluegene_layout_mode != LAYOUT_DYNAMIC) { + if(!delete_list) + delete_list = list_create(NULL); + list_append(delete_list, bg_record); + continue; + } + + /* below is only for dynamic modes since there are + never overlapping blocks there */ + /* if the block is smaller than the create size just + continue on. 
+ */ + if(bg_record->node_cnt < create_size) + continue; + + if(!smallest_bg_record || + (smallest_bg_record->node_cnt > bg_record->node_cnt)) + smallest_bg_record = bg_record; + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&block_state_mutex); + + if(bluegene_layout_mode != LAYOUT_DYNAMIC) { + debug3("running non-dynamic mode"); + if(delete_list) { + /* don't lock here since it is handled inside + the put_block_in_error_state + */ + itr = list_iterator_create(delete_list); + while ((bg_record = list_next(itr))) { + /* we already handled this */ + if(bg_record->state == RM_PARTITION_ERROR) + continue; + + rc = put_block_in_error_state( + bg_record, BLOCK_ERROR_STATE); + } + list_iterator_destroy(itr); + list_destroy(delete_list); + goto cleanup; + } - list_iterator_reset(itr); - while((found_record = list_next(itr))) { - if(bit_equal(bg_record->bitmap, - found_record->bitmap) - && bit_equal(bg_record->ionode_bitmap, - found_record->ionode_bitmap)) { - break; - } + debug("didn't get a smallest block"); + if(!node_already_down(bp_name)) { + time_t now = time(NULL); + char reason[128], time_str[32]; + slurm_make_time_str(&now, time_str, + sizeof(time_str)); + snprintf(reason, sizeof(reason), + "select_bluegene: " + "nodecard down [SLURM@%s]", + time_str); + slurm_drain_nodes(bp_name, reason); + } + rc = SLURM_SUCCESS; + goto cleanup; + } + + + if(smallest_bg_record) { + debug("smallest block is %s", smallest_bg_record->bg_block_id); + if(smallest_bg_record->state == RM_PARTITION_ERROR) { + rc = SLURM_SUCCESS; + goto cleanup; } - if(found_record) { - debug2("block %s[%s] already there", - found_record->nodes, - found_record->ionodes); - /* we'll get this one later. We are just - checking which ones we have to add right now. 
- */ - if(found_record->job_running > NO_JOB_RUNNING) - slurm_fail_job(found_record->job_running); - list_append(error_list, found_record); - destroy_bg_record(bg_record); - continue; - } else if(bluegene_layout_mode != LAYOUT_DYNAMIC) { - bg_record_t *smallest_bg_record = NULL; - /* here we only want to see if we can find the - smallest overlapping thing and set it to an - error */ - /* don't add anything new to the list since we aren't - dynamic */ - list_iterator_reset(itr); - while((found_record = list_next(itr))) { - if(found_record->node_cnt > 1) - /* we don't care about - anything over 1 midplane */ - if(!blocks_overlap(bg_record, found_record)) { - debug2("block %s isn't part of %s", - found_record->bg_block_id, - bg_record->bg_block_id); - continue; - } + while(smallest_bg_record->job_running > NO_JOB_RUNNING) + sleep(1); + + if(smallest_bg_record->node_cnt == create_size) { + rc = put_block_in_error_state( + smallest_bg_record, BLOCK_ERROR_STATE); + goto cleanup; + } + + if(create_size > smallest_bg_record->node_cnt) { + /* we should never get here. This means we + * have a create_size that is bigger than a + * block that is already made. 
+ */ + rc = put_block_in_error_state( + smallest_bg_record, BLOCK_ERROR_STATE); + goto cleanup; + } + debug3("node count is %d", smallest_bg_record->node_cnt); + switch(smallest_bg_record->node_cnt) { +#ifndef HAVE_BGL + case 64: + blockreq.small32 = 2; + break; + case 256: + blockreq.small32 = 8; + break; +#endif + case 128: + blockreq.small32 = 4; + break; + case 512: + blockreq.small32 = 16; + break; + default: + rc = SLURM_ERROR; + goto cleanup; + break; + } - if(smallest_bg_record || - (smallest_bg_record->cpu_cnt - > found_record->cpu_cnt)) - smallest_bg_record = found_record; + if(create_size != bluegene_nodecard_node_cnt) { + blockreq.small128 = blockreq.small32 / 4; + blockreq.small32 = 0; + } + /* set the start to be the same as the start of the + ionode_bitmap */ + io_start = bit_ffs(smallest_bg_record->ionode_bitmap); + } else { + switch(create_size) { +#ifndef HAVE_BGL + case 64: + blockreq.small64 = 8; + break; + case 256: + blockreq.small256 = 2; + break; +#endif + case 32: + blockreq.small32 = 16; + break; + case 128: + blockreq.small128 = 4; + break; + case 512: + if(!node_already_down(bp_name)) { + time_t now = time(NULL); + char reason[128], time_str[32]; + slurm_make_time_str(&now, time_str, + sizeof(time_str)); + snprintf(reason, sizeof(reason), + "select_bluegene: " + "nodecard down [SLURM@%s]", + time_str); + slurm_drain_nodes(bp_name, reason); } + rc = SLURM_SUCCESS; + goto cleanup; + break; + default: + break; + } - if(smallest_bg_record) { - if(smallest_bg_record->job_running - > NO_JOB_RUNNING) - slurm_fail_job(smallest_bg_record-> - job_running); - list_append(error_list, smallest_bg_record); - } else { - if(!node_already_down(node_name)) - ba_update_node_state( - &ba_system_ptr->grid[coord[X]] - [coord[Y]][coord[Z]], - NODE_STATE_DRAIN); - } - - destroy_bg_record(bg_record); - continue; - } + /* since we don't have a block in this midplane + we need to start at the beginning. 
*/ + io_start = 0; + /* we also need a bg_block to pretend to be the + smallest block that takes up the entire midplane. */ + } + + /* Here we need to add blocks that take up nodecards on this + midplane. Since Slurm only keeps track of midplanes + natively this is the only way to handle this case. 
+ */ + requests = list_create(destroy_bg_record); + add_bg_record(requests, NULL, &blockreq, 1, io_start); + + + delete_list = list_create(NULL); + while((bg_record = list_pop(requests))) { + slurm_mutex_lock(&block_state_mutex); + itr = list_iterator_create(bg_list); + while((found_record = list_next(itr))) { + if(!blocks_overlap(bg_record, found_record)) + continue; + list_push(delete_list, found_record); + list_remove(itr); + num_block_to_free++; + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&block_state_mutex); - if(smallest_bg_record) { - if(smallest_bg_record->job_running - > NO_JOB_RUNNING) - slurm_fail_job(smallest_bg_record-> - job_running); - list_append(error_list, smallest_bg_record); - } else { - if(!node_already_down(node_name)) - ba_update_node_state( - &ba_system_ptr->grid[coord[X]] - [coord[Y]][coord[Z]], - NODE_STATE_DRAIN); - } - - destroy_bg_record(bg_record); - continue; - } - /* we need to add this record since it doesn't exist */ if(configure_block(bg_record) == SLURM_ERROR) { destroy_bg_record(bg_record); @@ -1250,59 +1222,151 @@ extern int down_sub_node_blocks(int *coord, bitstr_t *ionode_bitmap) "around bad nodecards", bg_record->bg_block_id); print_bg_record(bg_record); + slurm_mutex_lock(&block_state_mutex); list_append(bg_list, bg_record); - list_append(error_list, bg_record); + slurm_mutex_unlock(&block_state_mutex); + if(bit_overlap(bg_record->ionode_bitmap, + tmp_record.ionode_bitmap)) { + /* here we know the error block doesn't exist + so just set the state here */ + rc = put_block_in_error_state( + bg_record, BLOCK_ERROR_STATE); + } } + list_destroy(requests); - /* remove overlapping blocks */ - while((found_record = list_pop(error_list))) { - if(found_record->job_running == BLOCK_ERROR_STATE) - continue; - error("Setting block %s to error state " - "because of failed hardware.", found_record->bg_block_id); - found_record->job_running = BLOCK_ERROR_STATE; - found_record->state = RM_PARTITION_ERROR; - trigger_block_error(); 
- - /* we have to check them all just to make sure no - small blocks are there - */ - list_iterator_reset(itr); - while((bg_record = list_next(itr))) { - if(found_record == bg_record) - continue; - if(!blocks_overlap(bg_record, found_record)) { - debug2("block %s isn't part of %s", - found_record->bg_block_id, - bg_record->bg_block_id); - continue; - } - debug2("removing block %s because there is something " - "wrong with part of the base partition", - found_record->bg_block_id); - if(found_record->job_running > NO_JOB_RUNNING) - slurm_fail_job(found_record->job_running); - - /* don't remove any blocks if not dynamic */ - if(bluegene_layout_mode != LAYOUT_DYNAMIC) - continue; - list_push(delete_list, found_record); - list_remove(itr); - num_block_to_free++; - } - } - list_iterator_destroy(itr); + slurm_mutex_lock(&block_state_mutex); free_block_list(delete_list); list_destroy(delete_list); + sort_bg_record_inc_size(bg_list); slurm_mutex_unlock(&block_state_mutex); + last_bg_update = time(NULL); + +cleanup: + FREE_NULL_BITMAP(tmp_record.bitmap); + FREE_NULL_BITMAP(tmp_record.ionode_bitmap); + + return rc; + +} - list_destroy(error_list); - FREE_NULL_BITMAP(ionode_bitmap); +extern int up_nodecard(char *bp_name, bitstr_t *ionode_bitmap) +{ + ListIterator itr = NULL; + bg_record_t *bg_record = NULL; + struct node_record *node_ptr = NULL; + int bp_bit = 0; + int ret = 0; + + xassert(bp_name); + xassert(ionode_bitmap); + + node_ptr = find_node_record(bp_name); + if (!node_ptr) { + error ("up_nodecard: invalid node specified %s", + bp_name); + return EINVAL; + } + bp_bit = (node_ptr - node_record_table_ptr); + + slurm_mutex_lock(&block_state_mutex); + itr = list_iterator_create(bg_list); + while((bg_record = list_next(itr))) { + if(bg_record->job_running != BLOCK_ERROR_STATE) + continue; + if(!bit_test(bg_record->bitmap, bp_bit)) + continue; - xfree(node_name); - last_bg_update = time(NULL); + if(!bit_overlap(bg_record->ionode_bitmap, ionode_bitmap)) { + 
continue; + } + resume_block(bg_record); + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&block_state_mutex); + + /* FIX ME: This needs to call the opposite of + slurm_drain_nodes which does not yet exist. + */ + if((ret = node_already_down(bp_name))) { + /* means it was drained */ + if(ret == 2) { + /* debug("node %s put back into service after " */ +/* "being in an error state", */ +/* bp_name); */ + } + } + return SLURM_SUCCESS; +} + +extern int put_block_in_error_state(bg_record_t *bg_record, int state) +{ + uid_t pw_uid; + + xassert(bg_record); + /* Since we are putting this block in an error state we need + to wait for the job to be removed. We don't really + need to free the block though since we may just + want it to be in an error state for some reason. */ + while(bg_record->job_running > NO_JOB_RUNNING) + sleep(1); + + error("Setting Block %s to ERROR state.", bg_record->bg_block_id); + /* we add the block to these lists so we don't try to schedule + on them. */ + if(!block_ptr_exist_in_list(bg_job_block_list, bg_record)) { + list_push(bg_job_block_list, bg_record); + num_unused_cpus -= bg_record->cpu_cnt; + } + if(!block_ptr_exist_in_list(bg_booted_block_list, bg_record)) + list_push(bg_booted_block_list, bg_record); + + slurm_mutex_lock(&block_state_mutex); + bg_record->job_running = state; + bg_record->state = RM_PARTITION_ERROR; + + xfree(bg_record->user_name); + xfree(bg_record->target_name); + bg_record->user_name = xstrdup(bg_slurm_user_name); + bg_record->target_name = xstrdup(bg_slurm_user_name); + + pw_uid = uid_from_string(bg_record->user_name); + if(pw_uid == (uid_t) -1) { + error("No such user: %s", bg_record->user_name); + } else { + bg_record->user_uid = pw_uid; + } + slurm_mutex_unlock(&block_state_mutex); + + trigger_block_error(); + last_bg_update = time(NULL); + + return SLURM_SUCCESS; +} + +/* block_state_mutex should be locked before calling */ +extern int resume_block(bg_record_t *bg_record) +{ + xassert(bg_record); + + 
if(bg_record->job_running >= NO_JOB_RUNNING) + return SLURM_SUCCESS; + + debug("block %s put back into service after " + "being in an error state", + bg_record->bg_block_id); + + if(remove_from_bg_list(bg_job_block_list, bg_record) == SLURM_SUCCESS) + num_unused_cpus += bg_record->cpu_cnt; + remove_from_bg_list(bg_booted_block_list, bg_record); + + bg_record->job_running = NO_JOB_RUNNING; + bg_record->state = RM_PARTITION_FREE; + last_bg_update = time(NULL); + + return SLURM_SUCCESS; } /************************* local functions ***************************/ diff --git a/src/plugins/select/bluegene/plugin/bg_record_functions.h b/src/plugins/select/bluegene/plugin/bg_record_functions.h index 0c95f87c733b01bd412558d498efc172676db3fc..12bf010283c3ae7724a3b46141249653a09f0d67 100644 --- a/src/plugins/select/bluegene/plugin/bg_record_functions.h +++ b/src/plugins/select/bluegene/plugin/bg_record_functions.h @@ -102,10 +102,6 @@ typedef struct bg_record { uint32_t cpu_cnt; /* count of cpus per block */ uint32_t node_cnt; /* count of cnodes per block */ #ifdef HAVE_BGL - uint16_t quarter; /* used for small blocks - determine quarter of BP */ - uint16_t nodecard; /* used for small blocks - determine nodecard of quarter */ char *blrtsimage; /* BlrtsImage for this block */ #endif char *linuximage; /* LinuxImage/CnloadImage for @@ -136,17 +132,17 @@ extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id); extern int update_block_user(bg_record_t *bg_block_id, int set); extern void drain_as_needed(bg_record_t *bg_record, char *reason); -#ifdef HAVE_BGL -extern int set_ionodes(bg_record_t *bg_record); -#else extern int set_ionodes(bg_record_t *bg_record, int io_start, int io_nodes); -#endif -extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq); +extern int add_bg_record(List records, List used_nodes, blockreq_t *blockreq, + bool no_check, bitoff_t io_start); extern int handle_small_record_request(List records, blockreq_t *blockreq, 
bg_record_t *bg_record, bitoff_t start); extern int format_node_name(bg_record_t *bg_record, char *buf, int buf_size); -extern int down_sub_node_blocks(int *coord, bitstr_t *ionode_bitmap); +extern int down_nodecard(char *bp_name, bitoff_t io_start); +extern int up_nodecard(char *bp_name, bitstr_t *ionode_bitmap); +extern int put_block_in_error_state(bg_record_t *bg_record, int state); +extern int resume_block(bg_record_t *bg_record); #endif /* _BLUEGENE_BG_RECORD_FUNCTIONS_H_ */ diff --git a/src/plugins/select/bluegene/plugin/bg_switch_connections.c b/src/plugins/select/bluegene/plugin/bg_switch_connections.c index 53cb449c67a68e44371168c851df45eaf6201e4a..acf288612906482c208a56353f08cbd2af1684c6 100644 --- a/src/plugins/select/bluegene/plugin/bg_switch_connections.c +++ b/src/plugins/select/bluegene/plugin/bg_switch_connections.c @@ -323,187 +323,6 @@ static int _used_switches(ba_node_t* ba_node) return switch_count; } -#ifdef HAVE_BGL -extern int configure_small_block(bg_record_t *bg_record) -{ - int rc = SLURM_SUCCESS; -#ifdef HAVE_BG_FILES - bool small = true; - ba_node_t* ba_node = NULL; - rm_BP_t *curr_bp = NULL; - rm_bp_id_t bp_id = NULL; - int num_ncards = 0; - rm_nodecard_t *ncard; - rm_nodecard_list_t *ncard_list = NULL; - rm_quarter_t quarter; - int num, i; -#endif - if(bg_record->bp_count != 1) { - error("Requesting small block with %d bps, needs to be 1.", - bg_record->bp_count); - return SLURM_ERROR; - } - -#ifdef HAVE_BG_FILES - /* set that we are doing a small block */ - - if ((rc = bridge_set_data(bg_record->bg_block, RM_PartitionSmall, - &small)) != STATUS_OK) { - - fatal("bridge_set_data(RM_PartitionPsetsPerBP)", - bg_err_str(rc)); - } - - num_ncards = bg_record->node_cnt/bluegene_nodecard_node_cnt; - if(num_ncards < 1) - num_ncards = 1; - - if ((rc = bridge_set_data(bg_record->bg_block, - RM_PartitionNodeCardNum, - &num_ncards)) - != STATUS_OK) { - - fatal("bridge_set_data: RM_PartitionBPNum: %s", - bg_err_str(rc)); - } - - - ba_node = 
list_peek(bg_record->bg_block_list); - - if (_get_bp_by_location(bg, ba_node->coord, &curr_bp) - == SLURM_ERROR) { - fatal("_get_bp_by_location()"); - } - - /* Set the one BP */ - - if ((rc = bridge_set_data(bg_record->bg_block, - RM_PartitionBPNum, - &bg_record->bp_count)) - != STATUS_OK) { - - fatal("bridge_set_data: RM_PartitionBPNum: %s", - bg_err_str(rc)); - return SLURM_ERROR; - } - if ((rc = bridge_set_data(bg_record->bg_block, - RM_PartitionFirstBP, - curr_bp)) - != STATUS_OK) { - - fatal("bridge_set_data(" - "BRIDGE_PartitionFirstBP): %s", - bg_err_str(rc)); - return SLURM_ERROR; - } - - - /* find the bp_id of the bp to get the small32 */ - if ((rc = bridge_get_data(curr_bp, RM_BPID, &bp_id)) - != STATUS_OK) { - error("bridge_get_data(): %d", rc); - return SLURM_ERROR; - } - - - if(!bp_id) { - error("No BP ID was returned from database"); - return SLURM_ERROR; - } - - if ((rc = bridge_get_nodecards(bp_id, &ncard_list)) - != STATUS_OK) { - error("bridge_get_nodecards(%s): %d", - bp_id, rc); - free(bp_id); - return SLURM_ERROR; - } - free(bp_id); - - - if((rc = bridge_get_data(ncard_list, RM_NodeCardListSize, &num)) - != STATUS_OK) { - error("bridge_get_data(RM_NodeCardListSize): %s", - bg_err_str(rc)); - return SLURM_ERROR; - } - num_ncards = 0; - for(i=0; i<num; i++) { - if (i) { - if ((rc = bridge_get_data(ncard_list, - RM_NodeCardListNext, - &ncard)) != STATUS_OK) { - error("bridge_get_data" - "(RM_NodeCardListNext): %s", - rc); - rc = SLURM_ERROR; - goto cleanup; - } - } else { - if ((rc = bridge_get_data(ncard_list, - RM_NodeCardListFirst, - &ncard)) != STATUS_OK) { - error("bridge_get_data" - "(RM_NodeCardListFirst): %s", - rc); - rc = SLURM_ERROR; - goto cleanup; - } - } - - if ((rc = bridge_get_data(ncard, - RM_NodeCardQuarter, - &quarter)) != STATUS_OK) { - error("bridge_get_data(RM_NodeCardQuarter): %d",rc); - rc = SLURM_ERROR; - goto cleanup; - } - if(bg_record->quarter != quarter) - continue; - if(bg_record->nodecard != (uint16_t) NO_VAL) { - 
if(bg_record->nodecard != (i%4)) - continue; - } - - - if (num_ncards) { - if ((rc = bridge_set_data(bg_record->bg_block, - RM_PartitionNextNodeCard, - ncard)) - != STATUS_OK) { - - fatal("bridge_set_data(" - "RM_PartitionNextNodeCard): %s", - bg_err_str(rc)); - } - } else { - if ((rc = bridge_set_data(bg_record->bg_block, - RM_PartitionFirstNodeCard, - ncard)) - != STATUS_OK) { - - fatal("bridge_set_data(" - "RM_PartitionFirstNodeCard): %s", - bg_err_str(rc)); - } - } - - num_ncards++; - if(num_ncards == 4) - break; - } -cleanup: - if ((rc = bridge_free_nodecard_list(ncard_list)) != STATUS_OK) { - error("bridge_free_nodecard_list(): %s", bg_err_str(rc)); - return SLURM_ERROR; - } -#endif - debug2("making the small block"); - return rc; -} - -#else - extern int configure_small_block(bg_record_t *bg_record) { int rc = SLURM_SUCCESS; @@ -512,14 +331,13 @@ extern int configure_small_block(bg_record_t *bg_record) ba_node_t* ba_node = NULL; rm_BP_t *curr_bp = NULL; rm_bp_id_t bp_id = NULL; +#ifndef HAVE_BGL rm_nodecard_id_t nc_char = NULL; +#endif int nc_id = 0; - int num_ncards = 0, sub_nodecard = 0, ionode_card = 0; + int num_ncards = 0, sub_nodecard = 0, ionode_card = 0, nc_count = 0; rm_nodecard_t *ncard; rm_nodecard_list_t *ncard_list = NULL; -#ifdef HAVE_BGL - rm_quarter_t quarter; -#endif int num, i; int use_nc[bluegene_bp_nodecard_cnt]; double nc_pos = 0; @@ -530,7 +348,8 @@ extern int configure_small_block(bg_record_t *bg_record) bg_record->bp_count); return SLURM_ERROR; } - +/* info("configuring small block on ionodes %s out of %d ncs", */ +/* bg_record->ionodes, bluegene_bp_nodecard_cnt); */ #ifdef HAVE_BG_FILES /* set that we are doing a small block */ if ((rc = bridge_set_data(bg_record->bg_block, RM_PartitionSmall, @@ -632,7 +451,12 @@ extern int configure_small_block(bg_record_t *bg_record) bg_err_str(rc)); return SLURM_ERROR; } - num_ncards = 0; + if(num_ncards > num) { + error("You requested more (%d > %d) nodecards " + "than are available on this block 
%s", + num_ncards, num, bg_record->nodes); + } + for(i=0; i<num; i++) { if (i) { if ((rc = bridge_get_data(ncard_list, @@ -656,6 +480,15 @@ extern int configure_small_block(bg_record_t *bg_record) } } +#ifdef HAVE_BGL + /* on BG/L we assume the order never changes when the + system is up. This could change when a reboot of + the system happens, but that should be rare. + */ + nc_id = i; + if(!use_nc[i]) + continue; +#else if ((rc = bridge_get_data(ncard, RM_NodeCardID, &nc_char)) != STATUS_OK) { @@ -750,9 +583,9 @@ extern int configure_small_block(bg_record_t *bg_record) } } free(nc_char); +#endif - - if (num_ncards) { + if (nc_count) { if ((rc = bridge_set_data(bg_record->bg_block, RM_PartitionNextNodeCard, ncard)) @@ -778,8 +611,8 @@ extern int configure_small_block(bg_record_t *bg_record) } } - num_ncards++; - + nc_count++; +#ifndef HAVE_BGL if(sub_nodecard) { if((rc = bridge_free_nodecard(ncard)) != STATUS_OK) { error("bridge_free_nodecard(): %s", @@ -788,6 +621,9 @@ extern int configure_small_block(bg_record_t *bg_record) goto cleanup; } } +#endif + if(nc_count == num_ncards) + break; } cleanup: if ((rc = bridge_free_nodecard_list(ncard_list)) != STATUS_OK) { @@ -798,7 +634,6 @@ cleanup: debug2("making the small block"); return rc; } -#endif /** * connect the given switch up with the given connections diff --git a/src/plugins/select/bluegene/plugin/block_sys.c b/src/plugins/select/bluegene/plugin/block_sys.c index 9a4c9e748a1a49abd5b77e34dbfeed892c938bab..22e176b14ba58f9bcaa0968055ab3196178f80e6 100755 --- a/src/plugins/select/bluegene/plugin/block_sys.c +++ b/src/plugins/select/bluegene/plugin/block_sys.c @@ -270,8 +270,7 @@ static int _post_allocate(bg_record_t *bg_record) #ifdef HAVE_BG_FILES #ifdef HAVE_BGL -static int _find_nodecard(bg_record_t *bg_record, - rm_partition_t *block_ptr) +static int _find_nodecard(rm_partition_t *block_ptr, int *nc_id) { char *my_card_name = NULL; char *card_name = NULL; @@ -283,6 +282,9 @@ static int 
_find_nodecard(bg_record_t *bg_record, rm_nodecard_t *ncard = NULL; rm_BP_t *curr_bp = NULL; + xassert(block_ptr); + xassert(nc_id); + if((rc = bridge_get_data(block_ptr, RM_PartitionFirstNodeCard, &ncard)) @@ -357,12 +359,12 @@ static int _find_nodecard(bg_record_t *bg_record, rc = SLURM_ERROR; goto cleanup; } - if(strcmp(my_card_name,card_name)) { + if(strcmp(my_card_name, card_name)) { free(card_name); continue; } free(card_name); - bg_record->nodecard = (i%4); + (*nc_id) = i; break; } cleanup: @@ -397,7 +399,7 @@ int read_bg_blocks() { int rc = SLURM_SUCCESS; - int bp_cnt, i; + int bp_cnt, i, nc_cnt, io_cnt; rm_element_t *bp_ptr = NULL; rm_bp_id_t bpid; rm_partition_t *block_ptr = NULL; @@ -412,11 +414,8 @@ int read_bg_blocks() rm_partition_list_t *block_list = NULL; rm_partition_state_flag_t state = PARTITION_ALL_FLAG; rm_nodecard_t *ncard = NULL; -#ifdef HAVE_BGL - rm_quarter_t quarter; -#else int nc_id, io_start; -#endif + bool small = false; hostlist_t hostlist; /* expanded form of hosts */ @@ -493,10 +492,7 @@ int read_bg_blocks() free(tmp_char); bg_record->state = NO_VAL; -#ifdef HAVE_BGL - bg_record->quarter = (uint16_t) NO_VAL; - bg_record->nodecard = (uint16_t) NO_VAL; -#else +#ifndef HAVE_BGL if ((rc = bridge_get_data(block_ptr, RM_PartitionSize, &bp_cnt)) @@ -588,7 +584,7 @@ int read_bg_blocks() if((rc = bridge_get_data(block_ptr, RM_PartitionNodeCardNum, - &i)) + &nc_cnt)) != STATUS_OK) { error("bridge_get_data(" "RM_PartitionNodeCardNum): %s", @@ -596,33 +592,31 @@ int read_bg_blocks() goto clean_up; } #ifdef HAVE_BGL - if(i == 1) { - _find_nodecard(bg_record, block_ptr); - i = bluegene_bp_nodecard_cnt; - } + /* Translate nodecard count to ionode count */ + if((io_cnt = nc_cnt * bluegene_io_ratio)) + io_cnt--; + + nc_id = 0; + if(nc_cnt == 1) + _find_nodecard(block_ptr, &nc_id); + + bg_record->node_cnt = + nc_cnt * bluegene_nodecard_node_cnt; + bg_record->cpu_cnt = + bluegene_proc_ratio * bg_record->node_cnt; + if ((rc = bridge_get_data(ncard, 
RM_NodeCardQuarter, - &quarter)) != STATUS_OK) { + &io_start)) != STATUS_OK) { error("bridge_get_data(CardQuarter): %d",rc); goto clean_up; } - - bg_record->quarter = quarter; - - debug3("%s is in quarter %d nodecard %d", - bg_record->bg_block_id, - bg_record->quarter, - bg_record->nodecard); - bg_record->cpu_cnt = procs_per_node/i; - bg_record->node_cnt = bluegene_bp_node_cnt/i; - if(set_ionodes(bg_record) == SLURM_ERROR) - error("couldn't create ionode_bitmap " - "for %d.%d", - bg_record->quarter, bg_record->nodecard); + io_start *= bluegene_quarter_ionode_cnt; + io_start += bluegene_nodecard_ionode_cnt * (nc_id%4); #else /* Translate nodecard count to ionode count */ - if((i *= bluegene_io_ratio)) - i--; + if((io_cnt = nc_cnt * bluegene_io_ratio)) + io_cnt--; if ((rc = bridge_get_data(ncard, RM_NodeCardID, @@ -672,14 +666,18 @@ int read_bg_blocks() free(tmp_char); /* make sure i is 0 since we are only using * 1 ionode */ - i = 0; + io_cnt = 0; } +#endif - if(set_ionodes(bg_record, io_start, i) == SLURM_ERROR) + if(set_ionodes(bg_record, io_start, io_cnt) + == SLURM_ERROR) error("couldn't create ionode_bitmap " "for ionodes %d to %d", - io_start, io_start+i); -#endif + io_start, io_start+io_cnt); + debug3("%s uses ionodes %s", + bg_record->bg_block_id, + bg_record->ionodes); } else { #ifdef HAVE_BGL bg_record->cpu_cnt = procs_per_node @@ -701,8 +699,7 @@ int read_bg_blocks() don't want the bg_record->ionodes set. 
*/ bg_record->ionode_bitmap = bit_alloc(bluegene_numpsets); - } - + } bg_record->bg_block_list = get_and_set_block_wiring(bg_record->bg_block_id); @@ -966,7 +963,7 @@ int read_bg_blocks() return rc; } -#else +#endif extern int load_state_file(char *dir_name) { @@ -1059,6 +1056,27 @@ extern int load_state_file(char *dir_name) error("select_p_state_restore: problem unpacking node_info"); goto unpack_error; } + +#ifdef HAVE_BG_FILES + for (i=0; i<node_select_ptr->record_count; i++) { + bg_info_record = &(node_select_ptr->bg_info_array[i]); + + /* we only care about the states we need here + * everthing else should have been set up already */ + if(bg_info_record->state == RM_PARTITION_ERROR) { + if((bg_record = find_bg_record_in_list( + bg_curr_block_list, + bg_info_record->bg_block_id))) + put_block_in_error_state( + bg_record, BLOCK_ERROR_STATE); + } + } + + select_g_free_node_info(&node_select_ptr); + free_buf(buffer); + return SLURM_SUCCESS; +#endif + slurm_mutex_lock(&block_state_mutex); reset_ba_system(false); @@ -1124,13 +1142,10 @@ extern int load_state_file(char *dir_name) xstrdup(bg_info_record->ionodes); bg_record->ionode_bitmap = bit_copy(ionode_bitmap); bg_record->state = bg_info_record->state; -#ifdef HAVE_BGL - bg_record->quarter = bg_info_record->quarter; - bg_record->nodecard = bg_info_record->nodecard; -#endif - if(bg_info_record->state == RM_PARTITION_ERROR) - bg_record->job_running = BLOCK_ERROR_STATE; - else + + if(bg_info_record->state == RM_PARTITION_ERROR) { + put_block_in_error_state(bg_record, BLOCK_ERROR_STATE); + } else bg_record->job_running = NO_JOB_RUNNING; bg_record->bp_count = bit_size(node_bitmap); bg_record->node_cnt = bg_info_record->node_cnt; @@ -1241,5 +1256,3 @@ unpack_error: free_buf(buffer); return SLURM_FAILURE; } - -#endif diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 48bdca0d6894598efb5bcc5507c755f82b08cf15..5069842a26885992d42efe9a32acc688a4258b26 100644 --- 
a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -41,7 +41,7 @@ #include "defined_block.h" #include <stdio.h> -#define MMCS_POLL_TIME 120 /* poll MMCS for down switches and nodes +#define MMCS_POLL_TIME 30 /* poll MMCS for down switches and nodes * every 120 secs */ #define BG_POLL_TIME 0 /* poll bg blocks every 3 secs */ @@ -414,22 +414,18 @@ extern void sort_bg_record_inc_size(List records){ } /* - * bluegene_agent - detached thread periodically updates status of - * bluegene nodes. + * block_agent - thread periodically updates status of + * bluegene blocks. * - * NOTE: I don't grab any locks here because slurm_drain_nodes grabs - * the necessary locks. */ -extern void *bluegene_agent(void *args) +extern void *block_agent(void *args) { - static time_t last_mmcs_test; static time_t last_bg_test; int rc; + time_t now = time(NULL); - last_mmcs_test = time(NULL) + MMCS_POLL_TIME; - last_bg_test = time(NULL) + BG_POLL_TIME; + last_bg_test = now - BG_POLL_TIME; while (!agent_fini) { - time_t now = time(NULL); if (difftime(now, last_bg_test) >= BG_POLL_TIME) { if (agent_fini) /* don't bother */ @@ -449,16 +445,38 @@ extern void *bluegene_agent(void *args) "update_block_list 2"); } } + now = time(NULL); } + + sleep(1); + } + return NULL; +} +/* + * state_agent - thread periodically updates status of + * bluegene nodes. 
+ * + */ +extern void *state_agent(void *args) +{ + static time_t last_mmcs_test; + time_t now = time(NULL); + + last_mmcs_test = now - MMCS_POLL_TIME; + while (!agent_fini) { if (difftime(now, last_mmcs_test) >= MMCS_POLL_TIME) { if (agent_fini) /* don't bother */ break; /* quit now */ - last_mmcs_test = now; - test_mmcs_failures(); /* can run for a while */ - } + if(blocks_are_created) { + last_mmcs_test = now; + /* can run for a while */ + test_mmcs_failures(); + } + } sleep(1); + now = time(NULL); } return NULL; } @@ -726,6 +744,7 @@ extern void *mult_destroy_block(void *args) slurm_mutex_lock(&block_state_mutex); destroy_bg_record(bg_record); slurm_mutex_unlock(&block_state_mutex); + last_bg_update = time(NULL); debug2("destroyed"); already_here: @@ -1265,7 +1284,7 @@ no_calc: } for (i = 0; i < count; i++) { - add_bg_record(bg_list, NULL, blockreq_array[i]); + add_bg_record(bg_list, NULL, blockreq_array[i], 0, 0); } } s_p_hashtbl_destroy(tbl); @@ -1315,6 +1334,7 @@ extern int validate_current_blocks(char *dir) list_destroy(bg_found_block_list); bg_found_block_list = NULL; } + last_bg_update = time(NULL); blocks_are_created = 1; sort_bg_record_inc_size(bg_list); @@ -1385,6 +1405,10 @@ static int _validate_config_nodes(List *bg_found_block_list, char *dir) * happens in the state load before this in emulation mode */ if (read_bg_blocks() == SLURM_ERROR) return SLURM_ERROR; + /* since we only care about error states here we don't care + about the return code this must be done after the bg_list + is created */ + load_state_file(dir); #else /* read in state from last run. 
Only for emulation mode */ if ((rc = load_state_file(dir)) != SLURM_SUCCESS) @@ -1409,13 +1433,12 @@ static int _validate_config_nodes(List *bg_found_block_list, char *dir) while ((init_bg_record = list_next(itr_curr))) { if (strcasecmp(bg_record->nodes, init_bg_record->nodes)) continue; /* wrong nodes */ + if(!bit_equal(bg_record->ionode_bitmap, + init_bg_record->ionode_bitmap)) + continue; #ifdef HAVE_BGL if (bg_record->conn_type != init_bg_record->conn_type) continue; /* wrong conn_type */ - if(bg_record->quarter != init_bg_record->quarter) - continue; /* wrong quart */ - if(bg_record->nodecard != init_bg_record->nodecard) - continue; /* wrong nodecard */ if(bg_record->blrtsimage && strcasecmp(bg_record->blrtsimage, init_bg_record->blrtsimage)) @@ -1425,9 +1448,6 @@ static int _validate_config_nodes(List *bg_found_block_list, char *dir) && ((bg_record->conn_type < SELECT_SMALL) && (init_bg_record->conn_type < SELECT_SMALL))) continue; /* wrong conn_type */ - if(!bit_equal(bg_record->ionode_bitmap, - init_bg_record->ionode_bitmap)) - continue; #endif if(bg_record->linuximage && strcasecmp(bg_record->linuximage, diff --git a/src/plugins/select/bluegene/plugin/bluegene.h b/src/plugins/select/bluegene/plugin/bluegene.h index f617d4ddabe5d8dea1a38473c1bcd0483cb36a08..f7ce507c3a7f87fb53e0fa97959d6ff3a0503750 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.h +++ b/src/plugins/select/bluegene/plugin/bluegene.h @@ -103,6 +103,7 @@ extern int num_unused_cpus; #define MAX_PTHREAD_RETRIES 1 #define BLOCK_ERROR_STATE -3 +#define ADMIN_ERROR_STATE -4 #define NO_JOB_RUNNING -1 #define MAX_AGENT_COUNT 30 #define BUFSIZE 4096 @@ -144,9 +145,13 @@ extern char *convert_node_use(rm_partition_mode_t pt); /* sort a list of bg_records by size (node count) */ extern void sort_bg_record_inc_size(List records); -/* bluegene_agent - detached thread periodically tests status of bluegene - * nodes and switches */ -extern void *bluegene_agent(void *args); +/* block_agent - detached 
thread periodically tests status of bluegene + * blocks */ +extern void *block_agent(void *args); + +/* state_agent - thread periodically tests status of bluegene + * nodes, nodecards, and switches */ +extern void *state_agent(void *args); extern int bg_free_block(bg_record_t *bg_record); diff --git a/src/plugins/select/bluegene/plugin/defined_block.c b/src/plugins/select/bluegene/plugin/defined_block.c index 350808c955e757762b728ed94b44d855821a4378..fcb58e8f9736b60843d27317010a3f56d99448e7 100644 --- a/src/plugins/select/bluegene/plugin/defined_block.c +++ b/src/plugins/select/bluegene/plugin/defined_block.c @@ -93,27 +93,18 @@ extern int create_defined_blocks(bg_layout_t overlapped, bg_found_block_list); while ((found_record = (bg_record_t*) list_next(itr_found)) != NULL) { -/* info("%s.%d.%d ?= %s.%d.%d\n", */ +/* info("%s[%s[ ?= %s[%s]\n", */ /* bg_record->nodes, */ -/* bg_record->quarter, */ -/* bg_record->nodecard, */ +/* bg_record->ionodes, */ /* found_record->nodes, */ -/* found_record->quarter, */ -/* found_record->nodecard); */ +/* found_record->ionodes); */ if ((bit_equal(bg_record->bitmap, found_record->bitmap)) -#ifdef HAVE_BGL - && (bg_record->quarter == - found_record->quarter) - && (bg_record->nodecard == - found_record->nodecard) -#else && (bit_equal(bg_record-> ionode_bitmap, found_record-> ionode_bitmap)) -#endif ) { /* don't reboot this one */ break; @@ -305,7 +296,7 @@ extern int create_full_system_block(List bg_found_block_list) list_iterator_destroy(itr); bit_not(bitmap); - if(bit_ffs(bitmap)) { + if(bit_ffs(bitmap) != -1) { error("We don't have the entire system covered by partitions, " "can't create full system block"); FREE_NULL_BITMAP(bitmap); @@ -380,7 +371,7 @@ extern int create_full_system_block(List bg_found_block_list) blockreq.block = name; blockreq.conn_type = SELECT_TORUS; - add_bg_record(records, NULL, &blockreq); + add_bg_record(records, NULL, &blockreq, 0 , 0); xfree(name); bg_record = (bg_record_t *) list_pop(records); diff 
--git a/src/plugins/select/bluegene/plugin/dynamic_block.c b/src/plugins/select/bluegene/plugin/dynamic_block.c index 8131c206d5fae49544b3e3145b2a5ea2fb7795a5..f8665cb4180420d82a5e7af3370e16f2d0964c1d 100644 --- a/src/plugins/select/bluegene/plugin/dynamic_block.c +++ b/src/plugins/select/bluegene/plugin/dynamic_block.c @@ -39,13 +39,9 @@ #include "dynamic_block.h" -#ifdef HAVE_BGL -static int _split_block(List block_list, List new_blocks, - bg_record_t *bg_record, int procs); -#else static int _split_block(List block_list, List new_blocks, bg_record_t *bg_record, int cnodes); -#endif + static int _breakup_blocks(List block_list, List new_blocks, ba_request_t *request, List my_block_list); @@ -139,44 +135,18 @@ extern List create_dynamic_block(List block_list, xfree(nodes); FREE_NULL_BITMAP(bitmap); } -#ifdef HAVE_BGL - if(request->size==1 && cnodes < bluegene_bp_node_cnt) { - request->conn_type = SELECT_SMALL; - if(request->procs == (procs_per_node/16)) { - if(!bluegene_nodecard_ionode_cnt) { - error("can't create this size %d " - "on this system numpsets is %d", - request->procs, - bluegene_numpsets); - goto finished; - } - blockreq.small32=4; - blockreq.small128=3; - } else { - if(!bluegene_quarter_ionode_cnt) { - error("can't create this size %d " - "on this system numpsets is %d", - request->procs, - bluegene_numpsets); - goto finished; - } - blockreq.small128=4; - } - new_blocks = list_create(destroy_bg_record); - if(_breakup_blocks(block_list, new_blocks, - request, my_block_list) - != SLURM_SUCCESS) { - list_destroy(new_blocks); - new_blocks = NULL; - debug2("small block not able to be placed"); - //rc = SLURM_ERROR; - } else - goto finished; - } -#else if(request->size==1 && cnodes < bluegene_bp_node_cnt) { switch(cnodes) { +#ifdef HAVE_BGL + case 32: + blockreq.small32 = 4; + blockreq.small128 = 3; + break; + case 128: + blockreq.small128 = 4; + break; +#else case 16: blockreq.small16 = 2; blockreq.small32 = 1; @@ -202,6 +172,7 @@ extern List 
create_dynamic_block(List block_list, case 256: blockreq.small256 = 2; break; +#endif default: error("This size %d is unknown on this system", cnodes); goto finished; @@ -220,7 +191,7 @@ extern List create_dynamic_block(List block_list, } else goto finished; } -#endif + if(request->conn_type == SELECT_NAV) request->conn_type = SELECT_TORUS; @@ -253,16 +224,8 @@ extern List create_dynamic_block(List block_list, set in the ionode_bitmap. */ if(bg_record->job_running == NO_JOB_RUNNING -#ifdef HAVE_BGL - && (bg_record->quarter == (uint16_t) NO_VAL - || (bg_record->quarter == 0 - && (bg_record->nodecard == (uint16_t) NO_VAL - || bg_record->nodecard == 0))) -#else && ((bg_record->node_cnt >= bluegene_bp_node_cnt) - || (bit_ffs(bg_record->ionode_bitmap) == 0)) -#endif - ) { + || (bit_ffs(bg_record->ionode_bitmap) == 0))) { for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) request->start[i] = @@ -327,7 +290,7 @@ no_list: blockreq.ramdiskimage = request->ramdiskimage; blockreq.conn_type = request->conn_type; - add_bg_record(new_blocks, results, &blockreq); + add_bg_record(new_blocks, results, &blockreq, 0, 0); finished: reset_all_removed_bps(); @@ -346,91 +309,14 @@ finished: return new_blocks; } -#ifdef HAVE_BGL -extern bg_record_t *create_small_record(bg_record_t *bg_record, - uint16_t quarter, uint16_t nodecard) -{ - bg_record_t *found_record = NULL; - int small_size = 4; - ba_node_t *new_ba_node = NULL; - ba_node_t *ba_node = NULL; - found_record = (bg_record_t*) xmalloc(sizeof(bg_record_t)); - - found_record->job_running = NO_JOB_RUNNING; - found_record->user_name = xstrdup(bg_record->user_name); - found_record->user_uid = bg_record->user_uid; - found_record->bg_block_list = list_create(destroy_ba_node); - ba_node = list_peek(bg_record->bg_block_list); - if(!ba_node) { - hostlist_t hl = hostlist_create(bg_record->nodes); - char *host = hostlist_shift(hl); - hostlist_destroy(hl); - found_record->nodes = xstrdup(host); - free(host); - error("you gave me a list with no ba_nodes using 
%s", - found_record->nodes); - } else { - int i=0,j=0; - new_ba_node = ba_copy_node(ba_node); - for (i=0; i<BA_SYSTEM_DIMENSIONS; i++){ - for(j=0;j<NUM_PORTS_PER_NODE;j++) { - ba_node->axis_switch[i].int_wire[j].used = 0; - if(i!=X) { - if(j==3 || j==4) - ba_node->axis_switch[i]. - int_wire[j]. - used = 1; - } - ba_node->axis_switch[i].int_wire[j]. - port_tar = j; - } - } - list_append(found_record->bg_block_list, new_ba_node); - found_record->bp_count = 1; - found_record->nodes = xstrdup_printf( - "%s%c%c%c", - bg_slurm_node_prefix, - alpha_num[ba_node->coord[X]], - alpha_num[ba_node->coord[Y]], - alpha_num[ba_node->coord[Z]]); - } - - found_record->blrtsimage = xstrdup(bg_record->blrtsimage); - found_record->linuximage = xstrdup(bg_record->linuximage); - found_record->mloaderimage = xstrdup(bg_record->mloaderimage); - found_record->ramdiskimage = xstrdup(bg_record->ramdiskimage); - - process_nodes(found_record, false); - - found_record->conn_type = SELECT_SMALL; - - found_record->node_use = SELECT_COPROCESSOR_MODE; - - if(nodecard != (uint16_t) NO_VAL) - small_size = bluegene_bp_nodecard_cnt; - found_record->cpu_cnt = procs_per_node/small_size; - found_record->node_cnt = bluegene_bp_node_cnt/small_size; - found_record->quarter = quarter; - found_record->nodecard = nodecard; - - if(set_ionodes(found_record) == SLURM_ERROR) - error("couldn't create ionode_bitmap for %d.%d", - found_record->quarter, found_record->nodecard); - return found_record; -} - -#else extern bg_record_t *create_small_record(bg_record_t *bg_record, bitstr_t *ionodes, int size) { bg_record_t *found_record = NULL; ba_node_t *new_ba_node = NULL; ba_node_t *ba_node = NULL; -#ifdef HAVE_BGL - int small_size = 4; -#else char bitstring[BITSIZE]; -#endif + found_record = (bg_record_t*) xmalloc(sizeof(bg_record_t)); found_record->job_running = NO_JOB_RUNNING; @@ -472,6 +358,7 @@ extern bg_record_t *create_small_record(bg_record_t *bg_record, alpha_num[ba_node->coord[Z]]); } #ifdef HAVE_BGL + 
found_record->node_use = SELECT_COPROCESSOR_MODE; found_record->blrtsimage = xstrdup(bg_record->blrtsimage); #endif found_record->linuximage = xstrdup(bg_record->linuximage); @@ -482,19 +369,6 @@ extern bg_record_t *create_small_record(bg_record_t *bg_record, found_record->conn_type = SELECT_SMALL; -#ifdef HAVE_BGL - found_record->node_use = SELECT_COPROCESSOR_MODE; - if(nodecard != (uint16_t) NO_VAL) - small_size = bluegene_bp_nodecard_cnt; - found_record->cpu_cnt = procs_per_node/small_size; - found_record->node_cnt = bluegene_bp_node_cnt/small_size; - found_record->quarter = quarter; - found_record->nodecard = nodecard; - - if(set_ionodes(found_record) == SLURM_ERROR) - error("couldn't create ionode_bitmap for %d.%d", - found_record->quarter, found_record->nodecard); -#else xassert(bluegene_proc_ratio); found_record->cpu_cnt = bluegene_proc_ratio * size; found_record->node_cnt = size; @@ -502,325 +376,57 @@ extern bg_record_t *create_small_record(bg_record_t *bg_record, found_record->ionode_bitmap = bit_copy(ionodes); bit_fmt(bitstring, BITSIZE, found_record->ionode_bitmap); found_record->ionodes = xstrdup(bitstring); -#endif + return found_record; } -#endif /*********************** Local Functions *************************/ -#ifdef HAVE_BGL static int _split_block(List block_list, List new_blocks, - bg_record_t *bg_record, int procs) + bg_record_t *bg_record, int cnodes) { - bg_record_t *found_record = NULL; bool full_bp = false; - int small_count = 0; - int small_size = 0; - uint16_t num_nodecard = 0, num_quarter = 0; - int i; - int node_cnt = 0; - uint16_t quarter = 0; - uint16_t nodecard = 0; - - if(bg_record->quarter == (uint16_t) NO_VAL) - full_bp = true; - - if(procs == (procs_per_node/bluegene_bp_nodecard_cnt) - && bluegene_nodecard_ionode_cnt) { - num_nodecard=4; - if(full_bp) - num_quarter=3; - } else if(full_bp) { - num_quarter = 4; - } else { - error("you asked for something that was already this size"); - return SLURM_ERROR; - } - debug2("asking for 
%d 32s from a %d block", - num_nodecard, bg_record->node_cnt); - small_count = num_nodecard+num_quarter; - - /* break base partition up into 16 parts */ - small_size = bluegene_bp_node_cnt/bluegene_nodecard_node_cnt; - node_cnt = 0; - if(!full_bp) - quarter = bg_record->quarter; - else - quarter = 0; - nodecard = 0; - for(i=0; i<small_count; i++) { - if(i == num_nodecard) { - /* break base partition up into 4 parts */ - small_size = 4; - } - - if(small_size == 4) - nodecard = (uint16_t)NO_VAL; - else - nodecard = i%4; - found_record = create_small_record(bg_record, - quarter, - nodecard); - list_append(new_blocks, found_record); - - node_cnt += bluegene_bp_node_cnt/small_size; - if(node_cnt == 128) { - node_cnt = 0; - quarter++; - } - } - - return SLURM_SUCCESS; -} - -static int _breakup_blocks(List block_list, List new_blocks, - ba_request_t *request, List my_block_list) -{ - int rc = SLURM_ERROR; - bg_record_t *bg_record = NULL; - ListIterator itr; - int total_proc_cnt=0; - uint16_t last_quarter = (uint16_t) NO_VAL; - char tmp_char[256]; - - debug2("proc count = %d size = %d", - request->procs, request->size); + bitoff_t start = 0; + blockreq_t blockreq; - itr = list_iterator_create(block_list); - while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { - if(bg_record->job_running != NO_JOB_RUNNING) - continue; - if(bg_record->state != RM_PARTITION_FREE) - continue; - if (request->avail_node_bitmap && - !bit_super_set(bg_record->bitmap, - request->avail_node_bitmap)) { - debug2("bg block %s has nodes not usable by this job", - bg_record->bg_block_id); - continue; - } - - if(request->start_req) { - if ((request->start[X] != bg_record->start[X]) - || (request->start[Y] != bg_record->start[Y]) - || (request->start[Z] != bg_record->start[Z])) { - debug4("small got %c%c%c looking for %c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]], - alpha_num[request->start[X]], - alpha_num[request->start[Y]], - 
alpha_num[request->start[Z]]); - continue; - } - debug3("small found %c%c%c looking for %c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]], - alpha_num[request->start[X]], - alpha_num[request->start[Y]], - alpha_num[request->start[Z]]); - } + memset(&blockreq, 0, sizeof(blockreq_t)); - if(bg_record->cpu_cnt == request->procs) { - debug2("found it here %s, %s", - bg_record->bg_block_id, - bg_record->nodes); - request->save_name = xstrdup_printf( - "%c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]]); - rc = SLURM_SUCCESS; - goto finished; - } - if(bg_record->node_cnt > bluegene_bp_node_cnt) - continue; - if(bg_record->cpu_cnt < request->procs) { - if(last_quarter != bg_record->quarter){ - last_quarter = bg_record->quarter; - total_proc_cnt = bg_record->cpu_cnt; - } else { - total_proc_cnt += bg_record->cpu_cnt; - } - debug2("1 got %d on quarter %d", - total_proc_cnt, last_quarter); - if(total_proc_cnt == request->procs) { - request->save_name = xstrdup_printf( - "%c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]]); - if(!my_block_list) { - rc = SLURM_SUCCESS; - goto finished; - } - - bg_record = create_small_record( - bg_record, - last_quarter, - (uint16_t) NO_VAL); - list_append(new_blocks, bg_record); - - rc = SLURM_SUCCESS; - goto finished; - } - continue; - } + switch(bg_record->node_cnt) { +#ifdef HAVE_BGL + case 32: + error("We got a 32 we should never have this"); + goto finished; break; - } - if(bg_record) { - debug2("got one on the first pass"); - goto found_one; - } - list_iterator_reset(itr); - last_quarter = (uint16_t) NO_VAL; - while ((bg_record = (bg_record_t *) list_next(itr)) - != NULL) { - if(bg_record->job_running != NO_JOB_RUNNING) - continue; - if (request->avail_node_bitmap && - !bit_super_set(bg_record->bitmap, - request->avail_node_bitmap)) { - debug2("bg block %s has 
nodes not usable by this job", - bg_record->bg_block_id); - continue; - } - - if(request->start_req) { - if ((request->start[X] != bg_record->start[X]) - || (request->start[Y] != bg_record->start[Y]) - || (request->start[Z] != bg_record->start[Z])) { - debug4("small 2 got %c%c%c looking for %c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]], - alpha_num[request->start[X]], - alpha_num[request->start[Y]], - alpha_num[request->start[Z]]); - continue; - } - debug3("small 2 found %c%c%c looking for %c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]], - alpha_num[request->start[X]], - alpha_num[request->start[Y]], - alpha_num[request->start[Z]]); - } - - if(bg_record->cpu_cnt == request->procs) { - debug2("found it here %s, %s", - bg_record->bg_block_id, - bg_record->nodes); - request->save_name = xstrdup_printf( - "%c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]]); - rc = SLURM_SUCCESS; + case 128: + switch(cnodes) { + case 32: + blockreq.small32 = 4; + break; + default: + error("We don't make a %d from size %d", + cnodes, bg_record->node_cnt); goto finished; - } - - if(bg_record->node_cnt > bluegene_bp_node_cnt) - continue; - if(bg_record->cpu_cnt < request->procs) { - if(last_quarter != bg_record->quarter){ - last_quarter = bg_record->quarter; - total_proc_cnt = bg_record->cpu_cnt; - } else { - total_proc_cnt += bg_record->cpu_cnt; - } - debug2("got %d on quarter %d", - total_proc_cnt, last_quarter); - if(total_proc_cnt == request->procs) { - request->save_name = xstrdup_printf( - "%c%c%c", - alpha_num[bg_record->start[X]], - alpha_num[bg_record->start[Y]], - alpha_num[bg_record->start[Z]]); - if(!my_block_list) { - rc = SLURM_SUCCESS; - goto finished; - } - bg_record = create_small_record( - bg_record, - last_quarter, - (uint16_t) NO_VAL); - list_append(new_blocks, bg_record); - - rc = 
SLURM_SUCCESS; - goto finished; - } - continue; - } - break; - } -found_one: - if(bg_record) { - List temp_list = NULL; - bg_record_t *found_record = NULL; - - if(bg_record->original) { - debug3("This was a copy"); - found_record = bg_record->original; - } else { - debug3("looking for original"); - found_record = find_org_in_bg_list( - bg_list, bg_record); + break; } - if(!found_record) { - error("this record wasn't found in the list!"); - rc = SLURM_ERROR; + break; + default: + switch(cnodes) { + case 32: + blockreq.small32 = 4; + blockreq.small128 = 3; + break; + case 128: + blockreq.small128 = 4; + break; + default: + error("We don't make a %d from size %d", + cnodes, bg_record->node_cnt); goto finished; + break; } - - format_node_name(found_record, tmp_char, sizeof(tmp_char)); - - debug2("going to split %s, %s", - found_record->bg_block_id, - tmp_char); - request->save_name = xstrdup_printf( - "%c%c%c", - alpha_num[found_record->start[X]], - alpha_num[found_record->start[Y]], - alpha_num[found_record->start[Z]]); - if(!my_block_list) { - rc = SLURM_SUCCESS; - goto finished; - } - _split_block(block_list, new_blocks, - found_record, request->procs); - remove_from_bg_list(block_list, bg_record); - destroy_bg_record(bg_record); - remove_from_bg_list(bg_list, found_record); - temp_list = list_create(NULL); - list_push(temp_list, found_record); - num_block_to_free++; - free_block_list(temp_list); - list_destroy(temp_list); - rc = SLURM_SUCCESS; - goto finished; - } - -finished: - list_iterator_destroy(itr); - - return rc; -} + full_bp = true; + break; #else - -static int _split_block(List block_list, List new_blocks, - bg_record_t *bg_record, int cnodes) -{ - bool full_bp = false; - bitoff_t start = 0; - blockreq_t blockreq; - - memset(&blockreq, 0, sizeof(blockreq_t)); - - switch(bg_record->node_cnt) { case 16: error("We got a 16 we should never have this"); goto finished; @@ -936,17 +542,25 @@ static int _split_block(List block_list, List new_blocks, } full_bp = 
true; break; +#endif } if(!full_bp && bg_record->ionode_bitmap) start = bit_ffs(bg_record->ionode_bitmap); +#ifdef HAVE_BGL + debug2("Asking for %u 32CNBlocks, and %u 128CNBlocks " + "from a %u block, starting at ionode %d.", + blockreq.small32, blockreq.small128, + bg_record->node_cnt, start); +#else debug2("Asking for %u 16CNBlocks, %u 32CNBlocks, " "%u 64CNBlocks, %u 128CNBlocks, and %u 256CNBlocks" "from a %u block, starting at ionode %d.", blockreq.small16, blockreq.small32, blockreq.small64, blockreq.small128, blockreq.small256, bg_record->node_cnt, start); +#endif handle_small_record_request(new_blocks, &blockreq, bg_record, start); finished: @@ -996,7 +610,7 @@ static int _breakup_blocks(List block_list, List new_blocks, * smallest blocks. */ again: - while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { + while ((bg_record = list_next(itr))) { if(bg_record->job_running != NO_JOB_RUNNING) continue; /* on the third time through look for just a block @@ -1055,7 +669,7 @@ again: } /* lets see if we can combine some small ones */ if(bg_record->node_cnt < cnodes) { - //char bitstring[BITSIZE]; + char bitstring[BITSIZE]; bitstr_t *bitstr = NULL; bit_or(ionodes, bg_record->ionode_bitmap); @@ -1075,9 +689,12 @@ again: } else total_cnode_cnt += bg_record->node_cnt; - //bit_fmt(bitstring, BITSIZE, ionodes); - debug2("1 adding %d got %d set", - bg_record->node_cnt, total_cnode_cnt); + bit_fmt(bitstring, BITSIZE, ionodes); + debug2("1 adding %s %d got %d set " + "ionodes %s total is %s", + bg_record->bg_block_id, + bg_record->node_cnt, total_cnode_cnt, + bg_record->ionodes, bitstring); if(total_cnode_cnt == cnodes) { request->save_name = xstrdup_printf( "%c%c%c", @@ -1167,5 +784,3 @@ finished: return rc; } - -#endif diff --git a/src/plugins/select/bluegene/plugin/dynamic_block.h b/src/plugins/select/bluegene/plugin/dynamic_block.h index 1555f8f8d980ed28907d31d6f4a743b0f701cf92..04dc4da43dab55f4fbb74b47ea78c53679d714e2 100644 --- 
a/src/plugins/select/bluegene/plugin/dynamic_block.h +++ b/src/plugins/select/bluegene/plugin/dynamic_block.h @@ -46,7 +46,7 @@ extern List create_dynamic_block(List block_list, ba_request_t *request, List my_block_list, bool track_down_nodes); -#ifdef HAVE_BGL +#ifdef HAVE_BGQ extern bg_record_t *create_small_record(bg_record_t *bg_record, uint16_t quarter, uint16_t nodecard); #else diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c index eff40fd44c28cbe1b5e684fcbd56b3424355a671..3c81e88e65ae95b1ceaeeb37a560dbeb3ae79326 100644 --- a/src/plugins/select/bluegene/plugin/select_bluegene.c +++ b/src/plugins/select/bluegene/plugin/select_bluegene.c @@ -85,7 +85,8 @@ const char plugin_type[] = "select/bluegene"; const uint32_t plugin_version = 100; /* pthread stuff for updating BG node status */ -static pthread_t bluegene_thread = 0; +static pthread_t block_thread = 0; +static pthread_t state_thread = 0; static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; /** initialize the status pthread */ @@ -136,18 +137,23 @@ static int _init_status_pthread(void) pthread_attr_t attr; pthread_mutex_lock( &thread_flag_mutex ); - if ( bluegene_thread ) { - debug2("Bluegene thread already running, not starting " - "another"); + if ( block_thread ) { + debug2("Bluegene threads already running, not starting " + "another"); pthread_mutex_unlock( &thread_flag_mutex ); return SLURM_ERROR; } slurm_attr_init( &attr ); /* since we do a join on this later we don't make it detached */ - if (pthread_create( &bluegene_thread, &attr, bluegene_agent, NULL) + if (pthread_create( &block_thread, &attr, block_agent, NULL) != 0) - error("Failed to create bluegene_agent thread"); + error("Failed to create block_agent thread"); + slurm_attr_init( &attr ); + /* since we do a join on this later we don't make it detached */ + if (pthread_create( &state_thread, &attr, state_agent, NULL) + != 0) + error("Failed to create 
state_agent thread"); pthread_mutex_unlock( &thread_flag_mutex ); slurm_attr_destroy( &attr ); @@ -177,10 +183,14 @@ extern int fini ( void ) agent_fini = true; pthread_mutex_lock( &thread_flag_mutex ); - if ( bluegene_thread ) { + if ( block_thread ) { verbose("Bluegene select plugin shutting down"); - pthread_join(bluegene_thread, NULL); - bluegene_thread = 0; + pthread_join(block_thread, NULL); + block_thread = 0; + } + if ( state_thread ) { + pthread_join(state_thread, NULL); + state_thread = 0; } pthread_mutex_unlock( &thread_flag_mutex ); fini_bg(); @@ -455,7 +465,7 @@ extern int select_p_pack_node_info(time_t last_query_time, Buf *buffer_ptr) debug2("Node select info hasn't changed since %d", last_bg_update); return SLURM_NO_CHANGE_IN_DATA; - } else { + } else if(blocks_are_created) { *buffer_ptr = NULL; buffer = init_buf(HUGE_BUF_SIZE); pack32(blocks_packed, buffer); @@ -464,10 +474,7 @@ extern int select_p_pack_node_info(time_t last_query_time, Buf *buffer_ptr) if(bg_list) { slurm_mutex_lock(&block_state_mutex); itr = list_iterator_create(bg_list); - while ((bg_record = (bg_record_t *) list_next(itr)) - != NULL) { - xassert(bg_record->bg_block_id != NULL); - + while ((bg_record = list_next(itr))) { pack_block(bg_record, buffer); blocks_packed++; } @@ -500,6 +507,9 @@ extern int select_p_pack_node_info(time_t last_query_time, Buf *buffer_ptr) set_buf_offset(buffer, tmp_offset); *buffer_ptr = buffer; + } else { + error("select_p_pack_node_info: bg_list not ready yet"); + return SLURM_ERROR; } return SLURM_SUCCESS; @@ -602,23 +612,9 @@ extern int select_p_update_block (update_part_msg_t *part_desc_ptr) } if(!part_desc_ptr->state_up) { - /* Since we are putting this block in an error state we need - to wait for the job to be removed. We don't really - need to free the block though since we may just - want it to be in an error state for some reason. 
*/ - while(bg_record->job_running > NO_JOB_RUNNING) - sleep(1); - - slurm_mutex_lock(&block_state_mutex); - bg_record->job_running = BLOCK_ERROR_STATE; - bg_record->state = RM_PARTITION_ERROR; - slurm_mutex_unlock(&block_state_mutex); - trigger_block_error(); + put_block_in_error_state(bg_record, BLOCK_ERROR_STATE); } else if(part_desc_ptr->state_up){ - slurm_mutex_lock(&block_state_mutex); - bg_record->job_running = NO_JOB_RUNNING; - bg_record->state = RM_PARTITION_FREE; - slurm_mutex_unlock(&block_state_mutex); + resume_block(bg_record); } else { return rc; } @@ -631,19 +627,12 @@ extern int select_p_update_block (update_part_msg_t *part_desc_ptr) extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr) { int rc = SLURM_SUCCESS; - bg_record_t *bg_record = NULL, *found_record = NULL; - time_t now; - char reason[128], tmp[64], time_str[32]; - blockreq_t blockreq; int i = 0, j = 0; - char coord[BA_SYSTEM_DIMENSIONS]; + char coord[BA_SYSTEM_DIMENSIONS+1], *node_name = NULL; char ionodes[128]; int set = 0; - int set_error = 0; + double nc_pos = 0, last_pos = -1; bitstr_t *ionode_bitmap = NULL; - List requests = NULL; - List delete_list = NULL; - ListIterator itr; if(bluegene_layout_mode != LAYOUT_DYNAMIC) { info("You can't use this call unless you are on a Dynamically " @@ -652,7 +641,7 @@ extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr) goto end_it; } - memset(coord, -1, BA_SYSTEM_DIMENSIONS); + memset(coord, 0, sizeof(coord)); memset(ionodes, 0, 128); if(!part_desc_ptr->name) { error("update_sub_node: No name specified"); @@ -661,10 +650,6 @@ extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr) } - now = time(NULL); - slurm_make_time_str(&now, time_str, sizeof(time_str)); - snprintf(tmp, sizeof(tmp), "[SLURM@%s]", time_str); - while (part_desc_ptr->name[j] != '\0') { if (part_desc_ptr->name[j] == '[') { if(set<1) { @@ -715,9 +700,9 @@ extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr) goto 
end_it; } } + strncpy(coord, part_desc_ptr->name+j, BA_SYSTEM_DIMENSIONS); - j += BA_SYSTEM_DIMENSIONS-1; set++; } @@ -731,166 +716,33 @@ extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr) goto end_it; } ionode_bitmap = bit_alloc(bluegene_numpsets); - bit_unfmt(ionode_bitmap, ionodes); - - requests = list_create(destroy_bg_record); - memset(&blockreq, 0, sizeof(blockreq_t)); - - blockreq.block = coord; - blockreq.conn_type = SELECT_SMALL; - blockreq.small32 = bluegene_bp_nodecard_cnt; - - add_bg_record(requests, NULL, &blockreq); + bit_unfmt(ionode_bitmap, ionodes); - delete_list = list_create(NULL); - while((bg_record = list_pop(requests))) { - set_error = 0; - if(bit_overlap(bg_record->ionode_bitmap, ionode_bitmap)) - set_error = 1; - - slurm_mutex_lock(&block_state_mutex); - itr = list_iterator_create(bg_list); - while((found_record = list_next(itr))) { - if(!found_record || (bg_record == found_record)) - continue; - if(bit_equal(bg_record->bitmap, found_record->bitmap) - && bit_equal(bg_record->ionode_bitmap, - found_record->ionode_bitmap)) { - debug2("block %s[%s] already there", - found_record->nodes, - found_record->ionodes); - /* we don't need to set this error, it - doesn't overlap - */ - if(!set_error) - break; - - snprintf(reason, sizeof(reason), - "update_sub_node: " - "Admin set block %s state to %s %s", - found_record->bg_block_id, - _block_state_str( - part_desc_ptr->state_up), - tmp); - info("%s",reason); - if(found_record->job_running - > NO_JOB_RUNNING) { - slurm_fail_job( - found_record->job_running); + node_name = xstrdup_printf("%s%s", bg_slurm_node_prefix, coord); + /* find out how many nodecards to get for each ionode */ + if(!part_desc_ptr->state_up) { + for(i = 0; i<bluegene_numpsets; i++) { + if(bit_test(ionode_bitmap, i)) { + if((int)nc_pos != (int)last_pos) { + down_nodecard(node_name, i); + last_pos = nc_pos; } - - if(!part_desc_ptr->state_up) { - found_record->job_running = - BLOCK_ERROR_STATE; - found_record->state 
= - RM_PARTITION_ERROR; - trigger_block_error(); - } else if(part_desc_ptr->state_up){ - found_record->job_running = - NO_JOB_RUNNING; - found_record->state = - RM_PARTITION_FREE; - } else { - error("update_sub_node: " - "Unknown state %d given", - part_desc_ptr->state_up); - rc = SLURM_ERROR; - break; - } - break; - } else if(!set_error - && bit_equal(bg_record->bitmap, - found_record->bitmap) - && bit_overlap( - bg_record->ionode_bitmap, - found_record->ionode_bitmap)) { - break; } - - } - list_iterator_destroy(itr); - slurm_mutex_unlock(&block_state_mutex); - /* we already found an existing record */ - if(found_record) { - destroy_bg_record(bg_record); - continue; + nc_pos += bluegene_nc_ratio; } - /* we need to add this record since it doesn't exist */ - if(configure_block(bg_record) == SLURM_ERROR) { - destroy_bg_record(bg_record); - error("update_sub_node: " - "unable to configure block in api"); - } - debug2("adding block %s to fill in small blocks " - "around bad blocks", - bg_record->bg_block_id); - print_bg_record(bg_record); - slurm_mutex_lock(&block_state_mutex); - list_append(bg_list, bg_record); - slurm_mutex_unlock(&block_state_mutex); - - /* We are just adding the block not deleting any or - setting this one to an error state. 
- */ - if(!set_error) - continue; - - if(!part_desc_ptr->state_up) { - bg_record->job_running = BLOCK_ERROR_STATE; - bg_record->state = RM_PARTITION_ERROR; - trigger_block_error(); - } else if(part_desc_ptr->state_up){ - bg_record->job_running = NO_JOB_RUNNING; - bg_record->state = RM_PARTITION_FREE; - } else { - error("update_sub_node: Unknown state %d given", - part_desc_ptr->state_up); - rc = SLURM_ERROR; - continue; - } - snprintf(reason, sizeof(reason), - "update_sub_node: " - "Admin set block %s state to %s %s", - bg_record->bg_block_id, - _block_state_str(part_desc_ptr->state_up), - tmp); - info("%s",reason); - - /* remove overlapping blocks */ - slurm_mutex_lock(&block_state_mutex); - itr = list_iterator_create(bg_list); - while((found_record = list_next(itr))) { - if ((!found_record) || (bg_record == found_record)) - continue; - if(!blocks_overlap(bg_record, found_record)) { - debug2("block %s isn't part of %s", - found_record->bg_block_id, - bg_record->bg_block_id); - continue; - } - debug2("removing block %s because there is something " - "wrong with part of the base partition", - found_record->bg_block_id); - if(found_record->job_running > NO_JOB_RUNNING) { - slurm_fail_job(found_record->job_running); - } - list_push(delete_list, found_record); - list_remove(itr); - num_block_to_free++; - } - list_iterator_destroy(itr); - free_block_list(delete_list); - slurm_mutex_unlock(&block_state_mutex); + } else if(part_desc_ptr->state_up){ + up_nodecard(node_name, ionode_bitmap); + } else { + return rc; } - list_destroy(delete_list); + FREE_NULL_BITMAP(ionode_bitmap); - + xfree(node_name); /* This only works for the error state, not free */ last_bg_update = time(NULL); - -end_it: - return rc; +end_it: + return SLURM_SUCCESS; } extern int select_p_get_info_from_plugin (enum select_data_info info, diff --git a/src/plugins/select/bluegene/plugin/state_test.c b/src/plugins/select/bluegene/plugin/state_test.c index 
052c9812e3d6615642996ee04d70328490866832..5264caa798c2861f897a9792f5373d83ca6cd6be 100644 --- a/src/plugins/select/bluegene/plugin/state_test.c +++ b/src/plugins/select/bluegene/plugin/state_test.c @@ -140,28 +140,6 @@ static void _configure_node_down(rm_bp_id_t bp_id, my_bluegene_t *my_bg) } } -/* Convert base partition state value to a string */ -static char *_convert_bp_state(rm_BP_state_t state) -{ - switch(state) { - case RM_BP_UP: - return "RM_BP_UP"; - break; - case RM_BP_DOWN: - return "RM_BP_DOWN"; - break; - case RM_BP_MISSING: - return "RM_BP_MISSING"; - break; - case RM_BP_ERROR: - return "RM_BP_ERROR"; - break; - case RM_BP_NAV: - return "RM_BP_NAV"; - } - return "BP_STATE_UNIDENTIFIED!"; -} - static int _test_down_nodecards(rm_BP_t *bp_ptr) { rm_bp_id_t bp_id = NULL; @@ -171,18 +149,17 @@ static int _test_down_nodecards(rm_BP_t *bp_ptr) int rc = SLURM_SUCCESS; rm_nodecard_list_t *ncard_list = NULL; rm_nodecard_t *ncard = NULL; - rm_nodecard_state state; - bitstr_t *ionode_bitmap = NULL; - bg_record_t *bg_record = NULL; + rm_nodecard_state_t state; + //bitstr_t *ionode_bitmap = NULL; + //bg_record_t *bg_record = NULL; int *coord = NULL; - char *node_name_tmp = NULL; - struct node_record *node_ptr = NULL; - int bp_bit = 0; - int set = 0, io_cnt = 1; + char *node_name = NULL; + //int bp_bit = 0; + //int io_cnt = 1; /* Translate 1 nodecard count to ionode count */ - if((io_cnt *= bluegene_io_ratio)) - io_cnt--; +/* if((io_cnt *= bluegene_io_ratio)) */ +/* io_cnt--; */ if ((rc = bridge_get_data(bp_ptr, RM_BPID, &bp_id)) != STATUS_OK) { @@ -204,7 +181,7 @@ static int _test_down_nodecards(rm_BP_t *bp_ptr) error("Could not find coordinates for " "BP ID %s", (char *) bp_id); rc = SLURM_ERROR; - goto cleanup; + goto clean_up; } node_name = xstrdup_printf("%s%c%c%c", @@ -222,7 +199,7 @@ static int _test_down_nodecards(rm_BP_t *bp_ptr) } for(i=0; i<num; i++) { - int nc_id = 0, io_start = 0; + int io_start = 0; if (i) { if ((rc = bridge_get_data(ncard_list, @@ 
-232,7 +209,7 @@ static int _test_down_nodecards(rm_BP_t *bp_ptr) "(RM_NodeCardListNext): %s", rc); rc = SLURM_ERROR; - goto cleanup; + goto clean_up; } } else { if ((rc = bridge_get_data(ncard_list, @@ -242,7 +219,7 @@ static int _test_down_nodecards(rm_BP_t *bp_ptr) "(RM_NodeCardListFirst: %s", rc); rc = SLURM_ERROR; - goto cleanup; + goto clean_up; } } if ((rc = bridge_get_data(ncard, @@ -251,7 +228,7 @@ static int _test_down_nodecards(rm_BP_t *bp_ptr) error("bridge_get_data(RM_NodeCardState: %s", rc); rc = SLURM_ERROR; - goto cleanup; + goto clean_up; } if(state == RM_NODECARD_UP) @@ -273,45 +250,84 @@ static int _test_down_nodecards(rm_BP_t *bp_ptr) debug("nodecard %s on %s is in an error state", nc_name, node_name); +#ifdef HAVE_BGL + if ((rc = bridge_get_data(ncard, + RM_NodeCardQuarter, + &io_start)) != STATUS_OK) { + error("bridge_get_data(CardQuarter): %d",rc); + goto clean_up; + } + io_start *= bluegene_quarter_ionode_cnt; + io_start += bluegene_nodecard_ionode_cnt * (i%4); +#else /* From the first nodecard id we can figure out where to start from with the alloc of ionodes. 
*/ - nc_id = atoi((char*)nc_name+1); + io_start = atoi((char*)nc_name+1); + io_start *= bluegene_io_ratio; +#endif free(nc_name); - io_start = nc_id * bluegene_io_ratio; - - if(!ionode_bitmap) - ionode_bitmap = bit_alloc(bluegene_numpsets); - - bit_nset(ionode_bitmap, io_start, io_start+io_cnt); +/* if(!ionode_bitmap) */ +/* ionode_bitmap = bit_alloc(bluegene_numpsets); */ +/* info("setting %d-%d of %d", */ +/* io_start, io_start+io_cnt, bluegene_numpsets); */ +/* bit_nset(ionode_bitmap, io_start, io_start+io_cnt); */ + /* we have to handle each nodecard separately to make + sure we don't create holes in the system */ + down_nodecard(node_name, io_start); } - if(ionode_bitmap) { - down_sub_node_blocks(coord, ionode_bitmap); - up_sub_node_blocks(coord, ionode_bitmap); - } else { - ListIterator itr = NULL; - slurm_mutex_lock(&block_state_mutex); - itr = list_iterator_create(bg_list); - while ((bg_record = list_next(itr))) { - if(bg_record->state != BLOCK_ERROR_STATE) - continue; - - if(!bit_test(bg_record->bitmap, bp_bit)) - continue; + /* this code is here to bring up a block after it is in an + error state. It is commented out because it hasn't been + tested very well yet. If you ever want to use this code + there should probably be a configurable option in the + bluegene.conf file that gives you an option as to have this + happen or not automatically. 
+ */ +/* if(ionode_bitmap) { */ +/* info("got ionode_bitmap"); */ + +/* bit_not(ionode_bitmap); */ +/* up_nodecard(node_name, ionode_bitmap); */ +/* } else { */ +/* int ret = 0; */ +/* info("no ionode_bitmap"); */ +/* ListIterator itr = NULL; */ +/* slurm_mutex_lock(&block_state_mutex); */ +/* itr = list_iterator_create(bg_list); */ +/* while ((bg_record = list_next(itr))) { */ +/* if(bg_record->job_running != BLOCK_ERROR_STATE) */ +/* continue; */ - bg_record->job_running = NO_JOB_RUNNING; - bg_record->state = RM_PARTITION_FREE; - } - list_iterator_destroy(itr); - slurm_mutex_unlock(&block_state_mutex); - } +/* if(!bit_test(bg_record->bitmap, bp_bit)) */ +/* continue; */ +/* info("bringing %s back to service", */ +/* bg_record->bg_block_id); */ +/* bg_record->job_running = NO_JOB_RUNNING; */ +/* bg_record->state = RM_PARTITION_FREE; */ +/* last_bg_update = time(NULL); */ +/* } */ +/* list_iterator_destroy(itr); */ +/* slurm_mutex_unlock(&block_state_mutex); */ + +/* /\* FIX ME: This needs to call the opposite of */ +/* slurm_drain_nodes which does not yet exist. 
*/ +/* *\/ */ +/* if((ret = node_already_down(node_name))) { */ +/* /\* means it was drained *\/ */ +/* if(ret == 2) { */ +/* /\* debug("node %s put back into service after " *\/ */ +/* /\* "being in an error state", *\/ */ +/* /\* node_name); *\/ */ +/* } */ +/* } */ +/* } */ -cleanup: +clean_up: xfree(node_name); - if(ionode_bitmap) - FREE_NULL_BITMAP(ionode_bitmap); +/* if(ionode_bitmap) */ +/* FREE_NULL_BITMAP(ionode_bitmap); */ free(bp_id); return rc; @@ -322,15 +338,8 @@ static void _test_down_nodes(my_bluegene_t *my_bg) { int bp_num, i, rc; rm_BP_t *my_bp; - rm_BP_state_t bp_state; - rm_location_t bp_loc; - char down_node_list[BUFSIZE]; - char bg_down_node[128]; - char reason[128], time_str[32]; - time_t now = time(NULL); - debug2("Running _test_down_nodes"); - down_node_list[0] = '\0'; + debug("Running _test_down_nodes"); if ((rc = bridge_get_data(my_bg, RM_BPNum, &bp_num)) != STATUS_OK) { error("bridge_get_data(RM_BPNum): %s", bg_err_str(rc)); bp_num = 0; @@ -352,60 +361,8 @@ static void _test_down_nodes(my_bluegene_t *my_bg) } } - if ((rc = bridge_get_data(my_bp, RM_BPState, &bp_state)) - != STATUS_OK) { - error("bridge_get_data(RM_BPState): %s", - bg_err_str(rc)); - continue; - } - - if (bp_state == RM_BP_UP) { - _test_down_nodecards(my_bp); - continue; - } - - if ((rc = bridge_get_data(my_bp, RM_BPLoc, &bp_loc)) - != STATUS_OK) { - error("bridge_get_data(RM_BPLoc): %s", bg_err_str(rc)); - continue; - } - - /* we only want to look at the ones in the system */ - if(bp_loc.X >= DIM_SIZE[X] - || bp_loc.Y >= DIM_SIZE[Y] - || bp_loc.Z >= DIM_SIZE[Z]) - continue; - - - snprintf(bg_down_node, sizeof(bg_down_node), "%s%c%c%c", - bg_slurm_node_prefix, - alpha_num[bp_loc.X], alpha_num[bp_loc.Y], - alpha_num[bp_loc.Z]); - - - if (node_already_down(bg_down_node)) - continue; - - debug("_test_down_nodes: %s in state %s", - bg_down_node, _convert_bp_state(bp_state)); - - if ((strlen(down_node_list) + strlen(bg_down_node) - + 2) - < BUFSIZE) { - if (down_node_list[0] 
!= '\0') - strcat(down_node_list,","); - strcat(down_node_list, bg_down_node); - } else - error("down_node_list overflow"); - } - if (down_node_list[0]) { - slurm_make_time_str(&now, time_str, sizeof(time_str)); - snprintf(reason, sizeof(reason), - "select_bluegene: MMCS state not UP [SLURM@%s]", - time_str); - slurm_drain_nodes(down_node_list, reason); + _test_down_nodecards(my_bp); } - } /* Test for switches that are not UP in MMCS, @@ -469,7 +426,7 @@ static void _test_down_switches(my_bluegene_t *my_bg) #endif /* Determine if specific slurm node is already in DOWN or DRAIN state */ -extern bool node_already_down(char *node_name) +extern int node_already_down(char *node_name) { uint16_t base_state; struct node_record *node_ptr = find_node_record(node_name); @@ -477,14 +434,16 @@ extern bool node_already_down(char *node_name) if (node_ptr) { base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND); - if ((base_state == NODE_STATE_DOWN) - || (base_state == NODE_STATE_DRAIN)) - return true; + + if(base_state & NODE_STATE_DRAIN) + return 2; + else if (base_state == NODE_STATE_DOWN) + return 1; else - return false; + return 0; } - return false; + return 0; } /* @@ -518,16 +477,8 @@ extern int check_block_bp_states(char *bg_block_id) #ifdef HAVE_BG_FILES rm_partition_t *block_ptr = NULL; rm_BP_t *bp_ptr = NULL; - char *bpid = NULL; int bp_cnt = 0; int i = 0; - int *coord = NULL; - rm_BP_state_t bp_state; - char bg_down_node[128], reason[128], time_str[32]; - char down_node_list[BUFSIZE]; - time_t now = time(NULL); - - down_node_list[0] = '\0'; if ((rc = bridge_get_block(bg_block_id, &block_ptr)) != STATUS_OK) { error("Block %s doesn't exist.", bg_block_id); @@ -566,60 +517,13 @@ extern int check_block_bp_states(char *bg_block_id) break; } } - if ((rc = bridge_get_data(bp_ptr, RM_BPState, &bp_state)) - != STATUS_OK) { - error("bridge_get_data(RM_BPLoc): %s", - bg_err_str(rc)); - rc = SLURM_ERROR; - break; - } - if(bp_state == RM_BP_UP) - continue; - rc = 
SLURM_ERROR; - if ((rc = bridge_get_data(bp_ptr, RM_BPID, &bpid)) - != STATUS_OK) { - error("bridge_get_data(RM_BPID): %s", - bg_err_str(rc)); - break; - } - coord = find_bp_loc(bpid); - - if(!coord) { - fatal("Could not find coordinates for " - "BP ID %s", (char *) bpid); - } - free(bpid); - - snprintf(bg_down_node, sizeof(bg_down_node), "%s%c%c%c", - bg_slurm_node_prefix, - alpha_num[coord[X]], alpha_num[coord[Y]], - alpha_num[coord[Z]]); - - - if (node_already_down(bg_down_node)) - continue; - debug("check_block_bp_states: %s in state %s", - bg_down_node, _convert_bp_state(bp_state)); - if ((strlen(down_node_list) + strlen(bg_down_node) + 2) - < BUFSIZE) { - if (down_node_list[0] != '\0') - strcat(down_node_list,","); - strcat(down_node_list, bg_down_node); - } else - error("down_node_list overflow"); + _test_down_nodecards(bp_ptr); } cleanup: bridge_free_block(block_ptr); done: - if (down_node_list[0]) { - slurm_make_time_str(&now, time_str, sizeof(time_str)); - snprintf(reason, sizeof(reason), - "select_bluegene: MMCS state not UP [SLURM@%s]", - time_str); - slurm_drain_nodes(down_node_list, reason); - } #endif return rc; diff --git a/src/plugins/select/bluegene/plugin/state_test.h b/src/plugins/select/bluegene/plugin/state_test.h index 7dff652883be070d5b52710c3d6aedaf9be4d87a..0cd0ee5af49fa3d9de4c5a3380491f97eac2683d 100644 --- a/src/plugins/select/bluegene/plugin/state_test.h +++ b/src/plugins/select/bluegene/plugin/state_test.h @@ -39,8 +39,9 @@ #ifndef _STATE_TEST_H_ #define _STATE_TEST_H_ -/* Determine if specific slurm node is already in DOWN or DRAIN state */ -extern bool node_already_down(char *node_name); +/* Determine if specific slurm node is already in DOWN or DRAIN ret (1) or + * FAIL ret (2) state idle ret (0) */ +extern int node_already_down(char *node_name); /* * Search MMCS for failed switches and nodes. 
Failed resources are DRAINED in diff --git a/src/sreport/job_reports.c b/src/sreport/job_reports.c index 8901b27899d0b3e54fc1e68b509323dab05ab474..af28391a1f95a261c5d8ab13360106eca6e3fc72 100644 --- a/src/sreport/job_reports.c +++ b/src/sreport/job_reports.c @@ -83,6 +83,7 @@ enum { static List print_fields_list = NULL; /* types are of print_field_t */ static List grouping_print_fields_list = NULL; /* types are of print_field_t */ static int print_job_count = 0; +static bool flat_view = false; static void _destroy_local_grouping(void *object) { @@ -115,6 +116,58 @@ static void _destroy_cluster_grouping(void *object) } } +/* + * Comparator used for sorting clusters alphabetically + * + * returns: 1: cluster_a > cluster_b + * 0: cluster_a == cluster_b + * -1: cluster_a < cluster_b + * + */ +extern int _sort_cluster_grouping_dec(cluster_grouping_t *cluster_a, + cluster_grouping_t *cluster_b) +{ + int diff = 0; + + if(!cluster_a->cluster || !cluster_b->cluster) + return 0; + + diff = strcmp(cluster_a->cluster, cluster_b->cluster); + + if (diff > 0) + return 1; + else if (diff < 0) + return -1; + + return 0; +} + +/* + * Comparator used for sorting clusters alphabetically + * + * returns: 1: acct_a > acct_b + * 0: acct_a == acct_b + * -1: acct_a < acct_b + * + */ +extern int _sort_acct_grouping_dec(acct_grouping_t *acct_a, + acct_grouping_t *acct_b) +{ + int diff = 0; + + if(!acct_a->acct || !acct_b->acct) + return 0; + + diff = strcmp(acct_a->acct, acct_b->acct); + + if (diff > 0) + return 1; + else if (diff < 0) + return -1; + + return 0; +} + /* returns number of objects added to list */ extern int _addto_uid_char_list(List char_list, char *names) { @@ -247,6 +300,10 @@ static int _set_cond(int *start, int argc, char *argv[], MAX(command_len, 2))) { print_job_count = 1; continue; + } else if (!end && !strncasecmp (argv[i], "FlatView", + MAX(command_len, 2))) { + flat_view = true; + continue; } else if(!end || !strncasecmp (argv[i], "Clusters", MAX(command_len, 1))) { 
@@ -273,7 +330,7 @@ static int _set_cond(int *start, int argc, char *argv[], job_cond->usage_end = parse_time(argv[i]+end, 1); set = 1; } else if (!strncasecmp (argv[i], "Format", - MAX(command_len, 1))) { + MAX(command_len, 2))) { if(format_list) slurm_addto_char_list(format_list, argv[i]+end); } else if (!strncasecmp (argv[i], "Gid", MAX(command_len, 2))) { @@ -620,21 +677,6 @@ extern int job_sizes_grouped_by_top_acct(int argc, char *argv[]) goto end_it; } - memset(&assoc_cond, 0, sizeof(acct_association_cond_t)); - assoc_cond.id_list = job_cond->associd_list; - assoc_cond.cluster_list = job_cond->cluster_list; - /* don't limit associations to having the partition_list */ - //assoc_cond.partition_list = job_cond->partition_list; - if(!job_cond->acct_list || !list_count(job_cond->acct_list)) { - job_cond->acct_list = list_create(NULL); - list_append(job_cond->acct_list, "root"); - } - assoc_cond.parent_acct_list = job_cond->acct_list; - - - assoc_list = acct_storage_g_get_associations(db_conn, my_uid, - &assoc_cond); - if(print_fields_have_header) { char start_char[20]; char end_char[20]; @@ -662,6 +704,22 @@ extern int job_sizes_grouped_by_top_acct(int argc, char *argv[]) cluster_itr = list_iterator_create(cluster_list); group_itr = list_iterator_create(grouping_list); + if(flat_view) + goto no_assocs; + + memset(&assoc_cond, 0, sizeof(acct_association_cond_t)); + assoc_cond.id_list = job_cond->associd_list; + assoc_cond.cluster_list = job_cond->cluster_list; + /* don't limit associations to having the partition_list */ + //assoc_cond.partition_list = job_cond->partition_list; + if(!job_cond->acct_list || !list_count(job_cond->acct_list)) { + job_cond->acct_list = list_create(NULL); + list_append(job_cond->acct_list, "root"); + } + assoc_cond.parent_acct_list = job_cond->acct_list; + assoc_list = acct_storage_g_get_associations(db_conn, my_uid, + &assoc_cond); + if(!assoc_list) { debug2(" No assoc list given.\n"); goto no_assocs; @@ -758,19 +816,21 @@ no_assocs: 
/* here we are only looking for groups that * were added with the associations above */ - continue; -/* cluster_group = */ -/* xmalloc(sizeof(cluster_grouping_t)); */ -/* cluster_group->cluster = xstrdup(local_cluster); */ -/* cluster_group->acct_list = */ -/* list_create(_destroy_acct_grouping); */ -/* list_append(cluster_list, cluster_group); */ + if(!flat_view) + continue; + cluster_group = + xmalloc(sizeof(cluster_grouping_t)); + cluster_group->cluster = xstrdup(local_cluster); + cluster_group->acct_list = + list_create(_destroy_acct_grouping); + list_append(cluster_list, cluster_group); } acct_itr = list_iterator_create(cluster_group->acct_list); while((acct_group = list_next(acct_itr))) { - if(acct_group->lft != (uint32_t)NO_VAL - && job->lft != (uint32_t)NO_VAL) { + if(!flat_view + && (acct_group->lft != (uint32_t)NO_VAL) + && (job->lft != (uint32_t)NO_VAL)) { /* keep separate since we don't want * to so a strcmp if we don't have to */ @@ -783,34 +843,36 @@ no_assocs: list_iterator_destroy(acct_itr); if(!acct_group) { - //char *group = NULL; - //uint32_t last_size = 0; + char *group = NULL; + uint32_t last_size = 0; /* here we are only looking for groups that * were added with the associations above */ - continue; -/* acct_group = xmalloc(sizeof(acct_grouping_t)); */ -/* acct_group->acct = xstrdup(local_account); */ -/* acct_group->groups = */ -/* list_create(_destroy_local_grouping); */ -/* list_append(cluster_group->acct_list, acct_group); */ - -/* while((group = list_next(group_itr))) { */ -/* local_group = xmalloc(sizeof(local_grouping_t)); */ -/* local_group->jobs = list_create(NULL); */ -/* local_group->min_size = last_size; */ -/* last_size = atoi(group); */ -/* local_group->max_size = last_size-1; */ -/* list_append(acct_group->groups, local_group); */ -/* } */ -/* if(last_size) { */ -/* local_group = xmalloc(sizeof(local_grouping_t)); */ -/* local_group->jobs = list_create(NULL); */ -/* local_group->min_size = last_size; */ -/* local_group->max_size 
= INFINITE; */ -/* list_append(acct_group->groups, local_group); */ -/* } */ -/* list_iterator_reset(group_itr); */ + if(!flat_view) + continue; + + acct_group = xmalloc(sizeof(acct_grouping_t)); + acct_group->acct = xstrdup(local_account); + acct_group->groups = + list_create(_destroy_local_grouping); + list_append(cluster_group->acct_list, acct_group); + + while((group = list_next(group_itr))) { + local_group = xmalloc(sizeof(local_grouping_t)); + local_group->jobs = list_create(NULL); + local_group->min_size = last_size; + last_size = atoi(group); + local_group->max_size = last_size-1; + list_append(acct_group->groups, local_group); + } + if(last_size) { + local_group = xmalloc(sizeof(local_grouping_t)); + local_group->jobs = list_create(NULL); + local_group->min_size = last_size; + local_group->max_size = INFINITE; + list_append(acct_group->groups, local_group); + } + list_iterator_reset(group_itr); } local_itr = list_iterator_create(acct_group->groups); @@ -837,8 +899,12 @@ no_assocs: itr = list_iterator_create(print_fields_list); itr2 = list_iterator_create(grouping_print_fields_list); + list_sort(cluster_list, (ListCmpF)_sort_cluster_grouping_dec); list_iterator_reset(cluster_itr); while((cluster_group = list_next(cluster_itr))) { + + list_sort(cluster_group->acct_list, + (ListCmpF)_sort_acct_grouping_dec); acct_itr = list_iterator_create(cluster_group->acct_list); while((acct_group = list_next(acct_itr))) { @@ -1169,8 +1235,11 @@ no_assocs: itr = list_iterator_create(print_fields_list); itr2 = list_iterator_create(grouping_print_fields_list); + list_sort(cluster_list, (ListCmpF)_sort_cluster_grouping_dec); list_iterator_reset(cluster_itr); while((cluster_group = list_next(cluster_itr))) { + list_sort(cluster_group->acct_list, + (ListCmpF)_sort_acct_grouping_dec); acct_itr = list_iterator_create(cluster_group->acct_list); while((acct_group = list_next(acct_itr))) { diff --git a/src/sreport/sreport.c b/src/sreport/sreport.c index 
ce12b4ee3c463a44f952502690a885b9972af55b..cc7626d69fc0e177b5c0f7ce1e77d96ed4dd7ead 100644 --- a/src/sreport/sreport.c +++ b/src/sreport/sreport.c @@ -699,7 +699,14 @@ sreport [<OPTION>] [<COMMAND>] \n\ of accounts under those specified will be\n\ displayed, not the accounts specified. \n\ In the SizesByAccount reports the default\n\ - for accounts is root. \n\ + for accounts is root. This explanation \n\ + does not apply when ran with the FlatView\n\ + option. \n\ + - FlatView - When used with the SizesbyAccount \n\ + will not group accounts in a \n\ + hierarchical level, but print each \n\ + account where jobs ran on a separate \n\ + line without any hierarchy. \n\ - GID=<OPT> - List of group ids to include in report. \n\ Default is all. \n\ - Grouping=<OPT> - Comma separated list of size groupings. \n\ @@ -709,8 +716,9 @@ sreport [<OPTION>] [<COMMAND>] \n\ Default is all. \n\ - Partitions=<OPT> - List of partitions jobs ran on to include\n\ in report. Default is all. \n\ - - PrintJobCount - When used with the Sizes report will print\n\ - number of jobs ran instead of time used. \n\ + - PrintJobCount - When used with the any Sizes report \n\ + will print number of jobs ran instead of \n\ + time used. \n\ - Users=<OPT> - List of users jobs to include in report. \n\ Default is all. \n\ - Wckeys=<OPT> - List of wckeys to use for the report. \n\