From bba4755872ed30ef3e95dd774d6593d4d2a68900 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Thu, 11 Jan 2007 20:09:18 +0000 Subject: [PATCH] BLUEGENE - added state save and recover for bluegene systems --- NEWS | 2 + src/api/node_select_info.h | 4 + src/common/node_select.c | 12 +- .../block_allocator/block_allocator.c | 17 +- .../block_allocator/block_allocator.h | 1 - .../select/bluegene/plugin/bg_block_info.c | 1 + .../select/bluegene/plugin/bg_job_place.c | 2 +- .../select/bluegene/plugin/block_sys.c | 20 +- src/plugins/select/bluegene/plugin/bluegene.c | 28 +- .../select/bluegene/plugin/select_bluegene.c | 343 +++++++++++++++++- src/sinfo/sinfo.c | 2 +- src/slurmctld/controller.c | 13 - src/slurmctld/read_config.c | 5 +- src/slurmctld/state_save.c | 2 +- 14 files changed, 406 insertions(+), 46 deletions(-) diff --git a/NEWS b/NEWS index 70ba28cc707..0111487188e 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,8 @@ documents those changes that are of interest to users and admins. ============================== -- BLUEGENE - Added correct node info for sinfo and sview for viewing allocated nodes in a partition. + -- BLUEGENE - Added state save on slurmctld shutdown of blocks in an error + state on real systems and total block config on emulation systems. 
* Changes in SLURM 1.2.0-pre11 ============================== diff --git a/src/api/node_select_info.h b/src/api/node_select_info.h index 8cebce17edc..fe3120e45a2 100644 --- a/src/api/node_select_info.h +++ b/src/api/node_select_info.h @@ -61,6 +61,10 @@ typedef struct { int *bp_inx; /* list index pairs into node_table for *nodes: * start_range_1, end_range_1, * start_range_2, .., -1 */ + int *ionode_inx; /* list index pairs for ionodes in the + * node listed for *ionodes: + * start_range_1, end_range_1, + * start_range_2, .., -1 */ char *blrtsimage; /* BlrtsImage for this block */ char *linuximage; /* LinuxImage for this block */ char *mloaderimage; /* mloaderImage for this block */ diff --git a/src/common/node_select.c b/src/common/node_select.c index b87fb107204..878eae17591 100644 --- a/src/common/node_select.c +++ b/src/common/node_select.c @@ -602,6 +602,7 @@ static void _free_node_info(bg_info_record_t *bg_info_record) xfree(bg_info_record->owner_name); xfree(bg_info_record->bg_block_id); xfree(bg_info_record->bp_inx); + xfree(bg_info_record->ionode_inx); xfree(bg_info_record->blrtsimage); xfree(bg_info_record->linuximage); xfree(bg_info_record->mloaderimage); @@ -645,6 +646,13 @@ static int _unpack_node_info(bg_info_record_t *bg_info_record, Buf buffer) bg_info_record->bp_inx = bitfmt2int(bp_inx_str); xfree(bp_inx_str); } + safe_unpackstr_xmalloc(&bp_inx_str, &uint16_tmp, buffer); + if (bp_inx_str == NULL) { + bg_info_record->ionode_inx = bitfmt2int(""); + } else { + bg_info_record->ionode_inx = bitfmt2int(bp_inx_str); + xfree(bp_inx_str); + } safe_unpackstr_xmalloc(&bg_info_record->blrtsimage, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&bg_info_record->linuximage, &uint16_tmp, @@ -1219,9 +1227,8 @@ extern int select_g_unpack_node_info(node_select_info_msg_t ** buf->bg_info_array = xmalloc(sizeof(bg_info_record_t) * buf->record_count); record_count = buf->record_count; - for(i=0; i<record_count; i++) { - if (_unpack_node_info(&(buf->bg_info_array[i]), 
buffer)) + if (_unpack_node_info(&(buf->bg_info_array[i]), buffer)) goto unpack_error; } *node_select_info_msg_pptr = buf; @@ -1250,6 +1257,7 @@ extern int select_g_free_node_info(node_select_info_msg_t ** buf->record_count = 0; for(i=0; i<buf->record_count; i++) _free_node_info(&(buf->bg_info_array[i])); + xfree(buf->bg_info_array); xfree(buf); return SLURM_SUCCESS; } diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index e31e4044432..9d309de61a6 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -233,7 +233,9 @@ extern int parse_blockreq(void **dest, slurm_parser_enum_t type, char temp[BUFSIZE]; tbl = s_p_hashtbl_create(block_options); s_p_parse_line(tbl, *leftover, leftover); - + if(!value) { + return 0; + } n = xmalloc(sizeof(blockreq_t)); hl = hostlist_create(value); hostlist_ranged_string(hl, BUFSIZE, temp); @@ -1275,6 +1277,7 @@ extern int check_and_set_node_list(List nodes) rc = SLURM_ERROR; goto end_it; } + if(ba_node->used) curr_ba_node->used = true; for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) { @@ -1347,6 +1350,7 @@ extern char *set_bg_block(List results, int *start, grid[start[X]]; #endif + if(!ba_node) return NULL; @@ -1356,6 +1360,16 @@ extern char *set_bg_block(List results, int *start, send_results = 1; list_append(results, ba_node); + if(conn_type == SELECT_SMALL) { + /* adding the ba_node and ending */ + ba_node->used = true; + name = xmalloc(4); + snprintf(name, 4, "%d%d%d", + ba_node->coord[X], + ba_node->coord[Y], + ba_node->coord[Z]); + goto end_it; + } found = _find_x_path(results, ba_node, ba_node->coord, ba_node->coord, @@ -3374,7 +3388,6 @@ static char *_set_internal_wires(List nodes, int size, int conn_type) for(i=0;i<count;i++) { if(!ba_node[i]->used) { ba_node[i]->used=1; - ba_node[i]->conn_type=conn_type; if(ba_node[i]->letter == '.') { ba_node[i]->letter = 
letters[color_count%62]; ba_node[i]->color = colors[color_count%6]; diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.h b/src/plugins/select/bluegene/block_allocator/block_allocator.h index 89b7cdda608..95c6298d097 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.h +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.h @@ -182,7 +182,6 @@ typedef struct { int color; int index; int state; - int conn_type; int phys_x; } ba_node_t; diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index 026fbecb3e5..106ce5b2067 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -221,6 +221,7 @@ extern void pack_block(bg_record_t *bg_record, Buf buffer) pack16((uint16_t)bg_record->nodecard, buffer); pack32((uint32_t)bg_record->node_cnt, buffer); pack_bit_fmt(bg_record->bitmap, buffer); + pack_bit_fmt(bg_record->ionode_bitmap, buffer); packstr(bg_record->blrtsimage, buffer); packstr(bg_record->linuximage, buffer); packstr(bg_record->mloaderimage, buffer); diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index a18322bfdb3..7a976c47ba4 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -756,7 +756,7 @@ try_again: } bit_and(slurm_block_bitmap, tmp_bitmap); - bit_free(tmp_bitmap); + FREE_NULL_BITMAP(tmp_bitmap); xfree(request.save_name); rc = SLURM_SUCCESS; goto end_it; diff --git a/src/plugins/select/bluegene/plugin/block_sys.c b/src/plugins/select/bluegene/plugin/block_sys.c index da6e2a057be..51261c0dd3d 100755 --- a/src/plugins/select/bluegene/plugin/block_sys.c +++ b/src/plugins/select/bluegene/plugin/block_sys.c @@ -220,9 +220,23 @@ static int _post_allocate(bg_record_t *bg_record) error("bridge_free_block(): %s", bg_err_str(rc)); #else static int 
block_inx = 0; - bg_record->bg_block_id = xmalloc(8); - snprintf(bg_record->bg_block_id, 8, - "RMP%d", block_inx++); + int i=0, temp = 0; + if(bg_record->bg_block_id) { + while((bg_record->bg_block_id[i] > '9' + || bg_record->bg_block_id[i] < '0') + && (bg_record->bg_block_id[i])) + i++; + if(bg_record->bg_block_id[i]) { + temp = atoi(bg_record->bg_block_id+i)+1; + if(temp > block_inx) + block_inx = temp; + info("first new block inx will now be %d", block_inx); + } + } else { + bg_record->bg_block_id = xmalloc(8); + snprintf(bg_record->bg_block_id, 8, + "RMP%d", block_inx++); + } #endif return rc; diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 7f75f3d23b3..9857c03a234 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -273,10 +273,8 @@ extern void destroy_bg_record(void *object) xfree(bg_record->target_name); if(bg_record->bg_block_list) list_destroy(bg_record->bg_block_list); - if(bg_record->bitmap) - bit_free(bg_record->bitmap); - if(bg_record->ionode_bitmap) - bit_free(bg_record->ionode_bitmap); + FREE_NULL_BITMAP(bg_record->bitmap); + FREE_NULL_BITMAP(bg_record->ionode_bitmap); xfree(bg_record->blrtsimage); xfree(bg_record->linuximage); @@ -435,8 +433,7 @@ extern void process_nodes(bg_record_t *bg_record) if ((bg_record->geo[X] == DIM_SIZE[X]) && (bg_record->geo[Y] == DIM_SIZE[Y]) - && (bg_record->geo[Z] == DIM_SIZE[Z])) - { + && (bg_record->geo[Z] == DIM_SIZE[Z])) { bg_record->full_block = 1; } @@ -497,15 +494,13 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record) sec_record->start[i] = fir_record->start[i]; } - if(sec_record->bitmap) - bit_free(sec_record->bitmap); + FREE_NULL_BITMAP(sec_record->bitmap); if(fir_record->bitmap && (sec_record->bitmap = bit_copy(fir_record->bitmap)) == NULL) { error("Unable to copy bitmap for %s", fir_record->nodes); sec_record->bitmap = NULL; } - 
if(sec_record->ionode_bitmap) - bit_free(sec_record->ionode_bitmap); + FREE_NULL_BITMAP(sec_record->ionode_bitmap); if(fir_record->ionode_bitmap && (sec_record->ionode_bitmap = bit_copy(fir_record->ionode_bitmap)) == NULL) { @@ -716,10 +711,10 @@ extern bool blocks_overlap(bg_record_t *rec_a, bg_record_t *rec_b) my_bitmap = bit_copy(rec_a->bitmap); bit_and(my_bitmap, rec_b->bitmap); if (bit_ffs(my_bitmap) == -1) { - bit_free(my_bitmap); + FREE_NULL_BITMAP(my_bitmap); return false; } - bit_free(my_bitmap); + FREE_NULL_BITMAP(my_bitmap); if(rec_a->quarter != (uint16_t) NO_VAL) { if(rec_b->quarter == (uint16_t) NO_VAL) @@ -972,8 +967,7 @@ extern int create_defined_blocks(bg_layout_t overlapped) reset_ba_system(); if(bg_list) { itr = list_iterator_create(bg_list); - while ((bg_record = (bg_record_t *) list_next(itr)) - != NULL) { + while((bg_record = list_next(itr))) { if(bg_found_block_list) { itr_found = list_iterator_create( bg_found_block_list); @@ -1185,8 +1179,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) bg_record->bg_block_id); list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); - if(my_bitmap) - bit_free(my_bitmap); + FREE_NULL_BITMAP(my_bitmap); return SLURM_ERROR; } //set_node_list(bg_record->bg_block_list); @@ -1194,8 +1187,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) } } list_iterator_destroy(itr); - if(my_bitmap) - bit_free(my_bitmap); + FREE_NULL_BITMAP(my_bitmap); } else { debug("No list was given"); } diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c index 05259e2e79b..977edfea162 100644 --- a/src/plugins/select/bluegene/plugin/select_bluegene.c +++ b/src/plugins/select/bluegene/plugin/select_bluegene.c @@ -38,9 +38,13 @@ \*****************************************************************************/ #include "bluegene.h" - +#include <fcntl.h> + #define HUGE_BUF_SIZE (1024*16) +/* Change 
BLOCK_STATE_VERSION value when changing the state save + * format i.e. pack_block() */ +#define BLOCK_STATE_VERSION "VER000" /* global */ int procs_per_node = 512; @@ -227,12 +231,345 @@ extern int fini ( void ) /* We rely upon DB2 to save and restore BlueGene state */ extern int select_p_state_save(char *dir_name) { + ListIterator itr; + bg_record_t *bg_record = NULL; + int error_code = 0, log_fd; + char *old_file, *new_file, *reg_file; + uint32_t blocks_packed = 0, tmp_offset, block_offset; + Buf buffer = init_buf(BUF_SIZE); + DEF_TIMERS; + + START_TIMER; + /* write header: time */ + packstr(BLOCK_STATE_VERSION, buffer); + block_offset = get_buf_offset(buffer); + pack32(blocks_packed, buffer); + pack_time(time(NULL), buffer); + + /* write block records to buffer */ + slurm_mutex_lock(&block_state_mutex); + itr = list_iterator_create(bg_list); + while((bg_record = list_next(itr))) { + /* on real bgl systems we only want to keep track of + * the blocks in an error state + */ +#ifdef HAVE_BG_FILES + if(bg_record->state != RM_PARTITION_ERROR) + continue; +#endif + xassert(bg_record->bg_block_id != NULL); + + pack_block(bg_record, buffer); + blocks_packed++; + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&block_state_mutex); + tmp_offset = get_buf_offset(buffer); + set_buf_offset(buffer, block_offset); + pack32(blocks_packed, buffer); + set_buf_offset(buffer, tmp_offset); + /* Maintain config read lock until we copy state_save_location *\ + \* unlock_slurmctld(part_read_lock); - see below */ + + /* write the buffer to file */ + old_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(old_file, "/block_state.old"); + reg_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(reg_file, "/block_state"); + new_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(new_file, "/block_state.new"); + log_fd = creat(new_file, 0600); + if (log_fd == 0) { + error("Can't save state, error creating file %s, %m", + new_file); + error_code = errno; + } 
else { + int pos = 0, nwrite = get_buf_offset(buffer), amount; + char *data = (char *)get_buf_data(buffer); + + while (nwrite > 0) { + amount = write(log_fd, &data[pos], nwrite); + if ((amount < 0) && (errno != EINTR)) { + error("Error writing file %s, %m", new_file); + error_code = errno; + break; + } + nwrite -= amount; + pos += amount; + } + fsync(log_fd); + close(log_fd); + } + if (error_code) + (void) unlink(new_file); + else { /* file shuffle */ + (void) unlink(old_file); + (void) link(reg_file, old_file); + (void) unlink(reg_file); + (void) link(new_file, reg_file); + (void) unlink(new_file); + } + xfree(old_file); + xfree(reg_file); + xfree(new_file); + + free_buf(buffer); + END_TIMER; + debug3("select_p_state_save %s", TIME_STR); return SLURM_SUCCESS; } extern int select_p_state_restore(char *dir_name) { - return SLURM_SUCCESS; + int error_code = SLURM_SUCCESS; + int state_fd, i, j=0; + char *state_file = NULL; + Buf buffer = NULL; + char *data = NULL; + int data_size = 0; + node_select_info_msg_t *node_select_ptr = NULL; + ListIterator itr; + bg_record_t *bg_record = NULL; + bg_info_record_t *bg_info_record = NULL; + bitstr_t *node_bitmap = NULL, *ionode_bitmap = NULL; + int geo[BA_SYSTEM_DIMENSIONS]; + char temp[256]; + List results = NULL; + int data_allocated, data_read = 0; + char *ver_str = NULL; + uint16_t ver_str_len; + struct passwd *pw_ent = NULL; + int blocks = 0; + + debug("bluegene: select_p_state_restore"); + + if(!dir_name) { + debug2("Starting bluegene with clean slate"); + return SLURM_SUCCESS; + } + state_file = xstrdup(dir_name); + xstrcat(state_file, "/block_state"); + state_fd = open(state_file, O_RDONLY); + if(state_fd < 0) { + error("No block state file (%s) to recover", state_file); + xfree(state_file); + return SLURM_SUCCESS; + } else { + data_allocated = BUF_SIZE; + data = xmalloc(data_allocated); + while (1) { + data_read = read(state_fd, &data[data_size], + BUF_SIZE); + if (data_read < 0) { + if (errno == EINTR) + continue; + 
else { + error("Read error on %s: %m", + state_file); + break; + } + } else if (data_read == 0) /* eof */ + break; + data_size += data_read; + data_allocated += data_read; + xrealloc(data, data_allocated); + } + close(state_fd); + } + xfree(state_file); + + buffer = create_buf(data, data_size); + + /* + * Check the data version so that when the format changes, we + * don't try to unpack data using the wrong format routines + */ + if(size_buf(buffer) + >= sizeof(uint16_t) + strlen(BLOCK_STATE_VERSION)) { + char *ptr = get_buf_data(buffer); + + if (!memcmp(&ptr[sizeof(uint16_t)], BLOCK_STATE_VERSION, 3)) { + safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); + debug3("Version string in block_state header is %s", + ver_str); + } + } + if (ver_str && (strcmp(ver_str, BLOCK_STATE_VERSION) != 0)) { + error("Can not recover block state, " + "data version incompatable"); + xfree(ver_str); + free_buf(buffer); + return EFAULT; + } + xfree(ver_str); + if(select_g_unpack_node_info(&node_select_ptr, buffer) == SLURM_ERROR) + goto unpack_error; + + reset_ba_system(); + + node_bitmap = bit_alloc(node_record_count); + ionode_bitmap = bit_alloc(bluegene_numpsets); + itr = list_iterator_create(bg_list); + for (i=0; i<node_select_ptr->record_count; i++) { + bg_info_record = &(node_select_ptr->bg_info_array[i]); + + bit_nclear(node_bitmap, 0, bit_size(node_bitmap) - 1); + bit_nclear(ionode_bitmap, 0, bit_size(ionode_bitmap) - 1); + + j = 0; + while(bg_info_record->ionode_inx[j] >= 0) { + bit_nset(ionode_bitmap, + bg_info_record->ionode_inx[j], + bg_info_record->ionode_inx[j+1]); + j += 2; + } + j = 0; + while(bg_info_record->bp_inx[j] >= 0) { + bit_nset(node_bitmap, + bg_info_record->bp_inx[j], + bg_info_record->bp_inx[j+1]); + j += 2; + } + + while((bg_record = list_next(itr))) { + if(bit_equal(bg_record->bitmap, node_bitmap) + && bit_equal(bg_record->ionode_bitmap, + ionode_bitmap)) + break; + } + list_iterator_reset(itr); + if(bg_record) { +
slurm_mutex_lock(&block_state_mutex); + if(bg_info_record->state == RM_PARTITION_ERROR) + bg_record->job_running = BLOCK_ERROR_STATE; + bg_record->state = bg_info_record->state; + slurm_mutex_unlock(&block_state_mutex); + } else { + int ionodes = 0; + char *name = NULL; + /* make the record that wasn't there (only for + * dynamic systems that are in emulation mode) */ +#ifdef HAVE_BG_FILES + error("Previous block %s is gone, not adding.", + bg_info_record->bg_block_id); + continue; +#endif + if(bluegene_layout_mode != LAYOUT_DYNAMIC) { + error("Only adding state save blocks in " + "Dynamic block creation Mode not " + "adding %s", + bg_info_record->bg_block_id); + continue; + } + + bg_record = xmalloc(sizeof(bg_record_t)); + bg_record->bg_block_id = + xstrdup(bg_info_record->bg_block_id); + bg_record->nodes = + xstrdup(bg_info_record->nodes); + bg_record->ionodes = + xstrdup(bg_info_record->ionodes); + bg_record->ionode_bitmap = bit_copy(ionode_bitmap); + bg_record->state = bg_info_record->state; + bg_record->quarter = bg_info_record->quarter; + bg_record->nodecard = bg_info_record->nodecard; + if(bg_info_record->state == RM_PARTITION_ERROR) + bg_record->job_running = BLOCK_ERROR_STATE; + else + bg_record->job_running = NO_JOB_RUNNING; + bg_record->bp_count = bit_size(node_bitmap); + bg_record->node_cnt = bg_info_record->node_cnt; + ionodes = bluegene_bp_node_cnt / bg_record->node_cnt; + bg_record->cpus_per_bp = procs_per_node / ionodes; + bg_record->node_use = bg_info_record->node_use; + bg_record->conn_type = bg_info_record->conn_type; + bg_record->boot_state = 0; + + process_nodes(bg_record); + + slurm_conf_lock(); + bg_record->target_name = + xstrdup(slurmctld_conf.slurm_user_name); + bg_record->user_name = + xstrdup(slurmctld_conf.slurm_user_name); + slurm_conf_unlock(); + if((pw_ent = getpwnam(bg_record->user_name)) + == NULL) { + error("getpwnam(%s): %m", + bg_record->user_name); + } else { + bg_record->user_uid = pw_ent->pw_uid; + } + + bg_record->blrtsimage
= + xstrdup(bg_info_record->blrtsimage); + bg_record->linuximage = + xstrdup(bg_info_record->linuximage); + bg_record->mloaderimage = + xstrdup(bg_info_record->mloaderimage); + bg_record->ramdiskimage = + xstrdup(bg_info_record->ramdiskimage); + + for(j=0; j<BA_SYSTEM_DIMENSIONS; j++) + geo[j] = bg_record->geo[j]; + + results = list_create(NULL); + name = set_bg_block(results, + bg_record->start, + geo, + bg_record->conn_type); + if(!name) { + error("I was unable to " + "make the " + "requested block."); + list_destroy(results); + destroy_bg_record(bg_record); + continue; + } + + slurm_conf_lock(); + snprintf(temp, sizeof(temp), "%s%s", + slurmctld_conf.node_prefix, + name); + slurm_conf_unlock(); + + xfree(name); + if(strcmp(temp, bg_record->nodes)) { + fatal("given list of %s " + "but allocated %s, " + "your order might be " + "wrong in the " + "bluegene.conf", + bg_record->nodes, + temp); + } + if(bg_record->bg_block_list) + list_destroy(bg_record->bg_block_list); + bg_record->bg_block_list = + list_create(destroy_ba_node); + copy_node_path(results, bg_record->bg_block_list); + list_destroy(results); + + configure_block(bg_record); + blocks++; + list_push(bg_list, bg_record); + } + } + FREE_NULL_BITMAP(ionode_bitmap); + FREE_NULL_BITMAP(node_bitmap); + list_iterator_destroy(itr); + + sort_bg_record_inc_size(bg_list); + + info("Recovered %d blocks", blocks); + select_g_free_node_info(&node_select_ptr); + free_buf(buffer); + return error_code; + +unpack_error: + error("Incomplete block data checkpoint file"); + free_buf(buffer); + return SLURM_FAILURE; } /* Sync BG blocks to currently active jobs */ @@ -664,7 +1001,7 @@ extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr) slurm_mutex_unlock(&block_state_mutex); } list_destroy(delete_list); - bit_free(ionode_bitmap); + FREE_NULL_BITMAP(ionode_bitmap); /* This only works for the error state, not free */ diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c index f7cfce4149c..599c98f4245 100644 --- 
a/src/sinfo/sinfo.c +++ b/src/sinfo/sinfo.c @@ -286,7 +286,7 @@ _query_server(partition_info_msg_t ** part_pptr, error_code = slurm_load_node_select(old_bg_ptr->last_update, &new_bg_ptr); if (error_code == SLURM_SUCCESS) - select_g_free_node_info(&new_bg_ptr); + select_g_free_node_info(&old_bg_ptr); else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) { error_code = SLURM_SUCCESS; new_bg_ptr = old_bg_ptr; diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index baba1932e0b..4fd41c1552a 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -290,19 +290,6 @@ int main(int argc, char *argv[]) if (slurm_sched_init() != SLURM_SUCCESS) fatal("failed to initialize scheduling plugin"); - /* Recover node scheduler state info */ - if (recover) { - error_code = select_g_state_restore( - slurmctld_conf.state_save_location); - } else { - error_code = select_g_state_restore(NULL); - } - if (error_code != SLURM_SUCCESS ) { - error("failed to restore node selection state"); - abort(); - } - - /* * create attached thread to process RPCs */ diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 37067e10314..26a947b5c0d 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -670,6 +670,7 @@ int read_slurm_conf(int recover) char *old_sched_type = xstrdup(slurmctld_conf.schedtype); char *old_select_type = xstrdup(slurmctld_conf.select_type); char *old_switch_type = xstrdup(slurmctld_conf.switch_type); + char *state_save_dir = xstrdup(slurmctld_conf.state_save_location); slurm_ctl_conf_t *conf; select_type_plugin_info_t old_select_type_p = (select_type_plugin_info_t) slurmctld_conf.select_type_param; @@ -739,16 +740,18 @@ int read_slurm_conf(int recover) } reset_first_job_id(); (void) slurm_sched_reconfig(); + xfree(state_save_dir); } if ((select_g_node_init(node_record_table_ptr, node_record_count) != SLURM_SUCCESS) || (select_g_block_init(part_list) != SLURM_SUCCESS) + || 
(select_g_state_restore(state_save_dir) != SLURM_SUCCESS) || (select_g_job_init(job_list) != SLURM_SUCCESS)) { error("failed to initialize node selection plugin state"); abort(); } - + xfree(state_save_dir); reset_job_bitmaps(); /* must follow select_g_job_init() */ (void) _sync_nodes_to_jobs(); diff --git a/src/slurmctld/state_save.c b/src/slurmctld/state_save.c index 49c7fd304aa..78c288d57c7 100644 --- a/src/slurmctld/state_save.c +++ b/src/slurmctld/state_save.c @@ -15,7 +15,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than -- GitLab