diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index c9a1abc19348c50b954da679a27469d30ac6c257..e5f6bac7a7032a39587be90a25decf524b30eca3 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1597,30 +1597,53 @@ extern char* node_use_string(enum node_use_type node_use) extern char *bg_block_state_string(uint16_t state) { - static char tmp[16]; + static char tmp[25]; + char *state_str = NULL; + char *err_str = NULL; + if (state & BG_BLOCK_ERROR_FLAG) { + err_str = "Error"; + state &= (~BG_BLOCK_ERROR_FLAG); + } switch (state) { case BG_BLOCK_NAV: - return "NAV"; + if (!err_str) + state_str = "NAV"; + else { + err_str = NULL; + state_str = "Error"; + } + break; case BG_BLOCK_FREE: - return "Free"; + state_str = "Free"; + break; case BG_BLOCK_BUSY: - return "Busy"; + state_str = "Busy"; + break; case BG_BLOCK_BOOTING: - return "Boot"; + state_str = "Boot"; + break; case BG_BLOCK_REBOOTING: - return "Reboot"; + state_str = "Reboot"; + break; case BG_BLOCK_INITED: - return "Ready"; + state_str = "Ready"; + break; case BG_BLOCK_ALLOCATED: - return "Alloc"; + state_str = "Alloc"; + break; case BG_BLOCK_TERM: - return "Term"; - case BG_BLOCK_ERROR: - return "Error"; + state_str = "Term"; + break; + default: + state_str = "Unknown"; + break; } - snprintf(tmp, sizeof(tmp), "%d", state); + if (err_str) + snprintf(tmp, sizeof(tmp), "%s(%s)", err_str, state_str); + else + return state_str; return tmp; } diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c index 302f3fe1997bb97c0471c29b5220fcf2ff592733..fba317a8438458b5bb9d3fa91c5c732079152f23 100644 --- a/src/plugins/select/bluegene/bg_core.c +++ b/src/plugins/select/bluegene/bg_core.c @@ -95,8 +95,8 @@ static int _post_block_free(bg_record_t *bg_record, bool restore) return SLURM_SUCCESS; } - if ((bg_record->state != BG_BLOCK_FREE) - && (bg_record->state != BG_BLOCK_ERROR)) { + if (!(bg_record->state & BG_BLOCK_ERROR_FLAG) 
+ && (bg_record->state != BG_BLOCK_FREE)) { /* Something isn't right, go mark this one in an error state. */ update_block_msg_t block_msg; @@ -107,7 +107,7 @@ static int _post_block_free(bg_record_t *bg_record, bool restore) bg_block_state_string(bg_record->state)); slurm_init_update_block_msg(&block_msg); block_msg.bg_block_id = bg_record->bg_block_id; - block_msg.state = BG_BLOCK_ERROR; + block_msg.state = BG_BLOCK_ERROR_FLAG; /* assign, not OR: consumers compare state with == */ block_msg.reason = "Block would not deallocate"; slurm_mutex_unlock(&block_state_mutex); select_g_update_block(&block_msg); @@ -206,12 +206,12 @@ static void *_track_freeing_blocks(void *args) /* Fake a free since we are n deallocating state before this. */ - if ((bg_record->state != BG_BLOCK_ERROR) + if (!(bg_record->state & BG_BLOCK_ERROR_FLAG) && (retry_cnt >= 3)) bg_record->state = BG_BLOCK_FREE; #endif if ((bg_record->state == BG_BLOCK_FREE) - || (bg_record->state == BG_BLOCK_ERROR)) + || (bg_record->state & BG_BLOCK_ERROR_FLAG)) free_cnt++; else if (bg_record->state != BG_BLOCK_TERM) bg_free_block(bg_record, 0, 1); @@ -371,7 +371,8 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) we set it ourselves so break out. */ - if (bg_record->state == BG_BLOCK_ERROR) + if (bg_record->state + & BG_BLOCK_ERROR_FLAG) break; #endif if (bg_conf->slurm_debug_flags @@ -400,7 +401,7 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) /* Fake a free since we are n deallocating state before this. 
*/ - if (bg_record->state == BG_BLOCK_ERROR) + if (bg_record->state & BG_BLOCK_ERROR_FLAG) break; else if (count >= 3) bg_record->state = BG_BLOCK_FREE; @@ -410,7 +411,7 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) if (!wait || (bg_record->state == BG_BLOCK_FREE) #ifdef HAVE_BGL - || (bg_record->state == BG_BLOCK_ERROR) + || (bg_record->state & BG_BLOCK_ERROR_FLAG) #endif ) { break; @@ -427,7 +428,7 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) rc = SLURM_SUCCESS; if ((bg_record->state == BG_BLOCK_FREE) - || (bg_record->state == BG_BLOCK_ERROR)) + || (bg_record->state & BG_BLOCK_ERROR_FLAG)) remove_from_bg_list(bg_lists->booted, bg_record); else if (count >= MAX_FREE_RETRIES) { /* Something isn't right, go mark this one in an error @@ -440,7 +441,7 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) bg_block_state_string(bg_record->state)); slurm_init_update_block_msg(&block_msg); block_msg.bg_block_id = bg_record->bg_block_id; - block_msg.state = BG_BLOCK_ERROR; + block_msg.state = BG_BLOCK_ERROR_FLAG; /* assign, not OR: consumers compare state with == */ block_msg.reason = "Block would not deallocate"; slurm_mutex_unlock(&block_state_mutex); select_g_update_block(&block_msg); @@ -645,7 +646,7 @@ extern int load_state_file(List curr_block_list, char *dir_name) /* we only care about the states we need here * everthing else should have been set up already */ - if (block_info->state == BG_BLOCK_ERROR) { + if (block_info->state & BG_BLOCK_ERROR_FLAG) { slurm_mutex_lock(&block_state_mutex); if ((bg_record = find_bg_record_in_list( curr_block_list, diff --git a/src/plugins/select/bluegene/bg_enums.h b/src/plugins/select/bluegene/bg_enums.h index 6cb7cb7a89cd1b71d45d409dfb6b4058bebd0f76..330032b643328ae29391b645fc0e6161973d2880 100644 --- a/src/plugins/select/bluegene/bg_enums.h +++ b/src/plugins/select/bluegene/bg_enums.h @@ -98,7 +98,6 @@ typedef enum { BG_BLOCK_INITED, // Block is initialized BG_BLOCK_REBOOTING, // Block is rebooting 
BG_BLOCK_TERM, // Block is terminating - BG_BLOCK_ERROR, // Block is in error BG_BLOCK_NAV, // Block state is undefined } bg_block_status_t; @@ -112,6 +111,9 @@ typedef enum { BG_JOB_ERROR //!< Job is in error status. } bg_job_status_t; +#define BG_BLOCK_ERROR_FLAG 0x1000 // Block is in error + + #define BG_SWITCH_NONE 0x0000 #define BG_SWITCH_OUT 0x0001 #define BG_SWITCH_IN 0x0002 diff --git a/src/plugins/select/bluegene/bg_job_place.c b/src/plugins/select/bluegene/bg_job_place.c index b06737879672efef2a926c472918a8b792f9ce80..ad80e635a44b299734a36d136d61ece9f918bb71 100644 --- a/src/plugins/select/bluegene/bg_job_place.c +++ b/src/plugins/select/bluegene/bg_job_place.c @@ -307,10 +307,10 @@ static bg_record_t *_find_matching_block(List block_list, if (bg_record->job_ptr) bg_record->job_running = bg_record->job_ptr->job_id; - /*block is messed up some how (BLOCK_ERROR_STATE) - * ignore it or if state == BG_BLOCK_ERROR */ + /* Block is messed up somehow: ignore it if job_running is + * BLOCK_ERROR_STATE or state has BG_BLOCK_ERROR_FLAG set. */ if ((bg_record->job_running == BLOCK_ERROR_STATE) - || (bg_record->state == BG_BLOCK_ERROR)) { + || (bg_record->state & BG_BLOCK_ERROR_FLAG)) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("block %s is in an error " "state (can't use)", @@ -597,10 +597,12 @@ static int _check_for_booted_overlapping_blocks( || SELECT_IS_MODE_RUN_NOW(query_mode)) && (bg_conf->layout_mode != LAYOUT_DYNAMIC))) && ((found_record->job_running != NO_JOB_RUNNING) - || (found_record->state == BG_BLOCK_ERROR))) { + || (found_record->state + & BG_BLOCK_ERROR_FLAG))) { if ((found_record->job_running == BLOCK_ERROR_STATE) - || (found_record->state == BG_BLOCK_ERROR)) + || (found_record->state + & BG_BLOCK_ERROR_FLAG)) error("can't use %s, " "overlapping block %s " "is in an error state.", @@ -1026,7 +1028,7 @@ static int _find_best_block_match(List block_list, */ bg_record->job_running = BLOCK_ERROR_STATE; - bg_record->state = BG_BLOCK_ERROR; + bg_record->state |= BG_BLOCK_ERROR_FLAG; 
error("_find_best_block_match: Picked " "block (%s) had some issues with " "hardware, trying a different " diff --git a/src/plugins/select/bluegene/bg_node_info.c b/src/plugins/select/bluegene/bg_node_info.c index e0ac56af4c635aa848c468d4837635b01ce9da5c..c9462bfcf2c7ed21213c277629a89446a659f614 100644 --- a/src/plugins/select/bluegene/bg_node_info.c +++ b/src/plugins/select/bluegene/bg_node_info.c @@ -209,12 +209,12 @@ unpack_error: extern select_nodeinfo_t *select_nodeinfo_alloc(uint32_t size) { select_nodeinfo_t *nodeinfo = xmalloc(sizeof(struct select_nodeinfo)); - uint32_t cluster_flags = slurmdb_setup_cluster_flags(); + //uint32_t cluster_flags = slurmdb_setup_cluster_flags(); if (!g_bitmap_size) { - if (cluster_flags & CLUSTER_FLAG_BGQ) - g_bitmap_size = bg_conf->mp_cnode_cnt; - else + /* if (cluster_flags & CLUSTER_FLAG_BGQ) */ + /* g_bitmap_size = bg_conf->mp_cnode_cnt; */ + /* else */ g_bitmap_size = bg_conf->ionodes_per_mp; } @@ -249,15 +249,15 @@ extern int select_nodeinfo_set_all(time_t last_query_time) int i=0; bg_record_t *bg_record = NULL; static time_t last_set_all = 0; - uint32_t cluster_flags = slurmdb_setup_cluster_flags(); + //uint32_t cluster_flags = slurmdb_setup_cluster_flags(); if (!blocks_are_created) return SLURM_NO_CHANGE_IN_DATA; if (!g_bitmap_size) { - if (cluster_flags & CLUSTER_FLAG_BGQ) - g_bitmap_size = bg_conf->mp_cnode_cnt; - else + /* if (cluster_flags & CLUSTER_FLAG_BGQ) */ + /* g_bitmap_size = bg_conf->mp_cnode_cnt; */ + /* else */ g_bitmap_size = bg_conf->ionodes_per_mp; } @@ -297,7 +297,7 @@ extern int select_nodeinfo_set_all(time_t last_query_time) if (bg_record->job_running == NO_JOB_RUNNING) continue; - if (bg_record->state == BG_BLOCK_ERROR) + if (bg_record->state & BG_BLOCK_ERROR_FLAG) state = NODE_STATE_ERROR; else if (bg_record->job_running > NO_JOB_RUNNING) { /* we don't need to set the allocated here @@ -311,10 +311,10 @@ extern int select_nodeinfo_set_all(time_t last_query_time) 
bg_block_state_string(bg_record->state)); continue; } - if ((cluster_flags & CLUSTER_FLAG_BGQ) - && (state != NODE_STATE_ERROR)) - bitmap = bg_record->cnodes_used_bitmap; - else + /* if ((cluster_flags & CLUSTER_FLAG_BGQ) */ + /* && (state != NODE_STATE_ERROR)) */ + /* bitmap = bg_record->cnodes_used_bitmap; */ + /* else */ bitmap = bg_record->ionode_bitmap; for (i=0; i<node_record_count; i++) { @@ -331,11 +331,11 @@ extern int select_nodeinfo_set_all(time_t last_query_time) state, g_bitmap_size); if (subgrp->cnode_cnt < bg_conf->mp_cnode_cnt) { - if (cluster_flags & CLUSTER_FLAG_BGQ) { - bit_or(subgrp->bitmap, bitmap); - subgrp->cnode_cnt += - bit_set_count(bitmap); - } else if (bg_record->cnode_cnt + /* if (cluster_flags & CLUSTER_FLAG_BGQ) { */ + /* bit_or(subgrp->bitmap, bitmap); */ + /* subgrp->cnode_cnt += */ + /* bit_set_count(bitmap); */ + /* } else */ if (bg_record->cnode_cnt < bg_conf->mp_cnode_cnt) { bit_or(subgrp->bitmap, bitmap); subgrp->cnode_cnt += diff --git a/src/plugins/select/bluegene/bg_record_functions.c b/src/plugins/select/bluegene/bg_record_functions.c index df7e73262f650df775ac63a48f9b7fa6b61019f8..cc539d12aaa9fd75c2bf9416ded3d8f2ae3bf627 100644 --- a/src/plugins/select/bluegene/bg_record_functions.c +++ b/src/plugins/select/bluegene/bg_record_functions.c @@ -1088,7 +1088,7 @@ extern int down_nodecard(char *mp_name, bitoff_t io_start, */ if (smallest_bg_record && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){ - if (smallest_bg_record->state == BG_BLOCK_ERROR) { + if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) { rc = SLURM_NO_CHANGE_IN_DATA; goto cleanup; } @@ -1157,7 +1157,7 @@ extern int down_nodecard(char *mp_name, bitoff_t io_start, } else if (smallest_bg_record) { debug2("smallest dynamic block is %s", smallest_bg_record->bg_block_id); - if (smallest_bg_record->state == BG_BLOCK_ERROR) { + if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) { rc = SLURM_NO_CHANGE_IN_DATA; goto cleanup; } @@ -1167,7 +1167,8 @@ extern 
int down_nodecard(char *mp_name, bitoff_t io_start, if (smallest_bg_record->cnode_cnt == create_size) { rc = put_block_in_error_state( - smallest_bg_record, BLOCK_ERROR_STATE, reason); + smallest_bg_record, BLOCK_ERROR_STATE, + reason); goto cleanup; } @@ -1411,7 +1412,7 @@ extern int put_block_in_error_state(bg_record_t *bg_record, list_push(bg_lists->booted, bg_record); bg_record->job_running = state; - bg_record->state = BG_BLOCK_ERROR; + bg_record->state |= BG_BLOCK_ERROR_FLAG; xfree(bg_record->user_name); xfree(bg_record->target_name); @@ -1442,21 +1443,23 @@ extern int resume_block(bg_record_t *bg_record) if (bg_record->job_running > NO_JOB_RUNNING) return SLURM_SUCCESS; - if (bg_record->state == BG_BLOCK_ERROR) + if (bg_record->state & BG_BLOCK_ERROR_FLAG) { + bg_record->state &= (~BG_BLOCK_ERROR_FLAG); info("Block %s put back into service after " "being in an error state.", bg_record->bg_block_id); + } if (remove_from_bg_list(bg_lists->job_running, bg_record) == SLURM_SUCCESS) num_unused_cpus += bg_record->cpu_cnt; + if (bg_record->state != BG_BLOCK_INITED) remove_from_bg_list(bg_lists->booted, bg_record); + else if (!block_ptr_exist_in_list(bg_lists->booted, bg_record)) + list_push(bg_lists->booted, bg_record); bg_record->job_running = NO_JOB_RUNNING; -#ifndef HAVE_BG_FILES - bg_record->state = BG_BLOCK_FREE; -#endif xfree(bg_record->reason); last_bg_update = time(NULL); @@ -1548,7 +1551,7 @@ static int _check_all_blocks_error(int node_inx, time_t event_time, itr = list_iterator_create(bg_lists->main); while ((bg_record = list_next(itr))) { /* only look at other nodes in error state */ - if (bg_record->state != BG_BLOCK_ERROR) + if (!(bg_record->state & BG_BLOCK_ERROR_FLAG)) continue; if (!bit_test(bg_record->bitmap, node_inx)) continue; diff --git a/src/plugins/select/bluegene/bg_status.c b/src/plugins/select/bluegene/bg_status.c index d3e8f965cd07968475c5190ff4b1ef6dd64c9fc4..d3b69ceb0f1f07ef4640f233e2d9a68a13c9fe36 100644 --- 
a/src/plugins/select/bluegene/bg_status.c +++ b/src/plugins/select/bluegene/bg_status.c @@ -119,15 +119,15 @@ static int _block_is_deallocating(bg_record_t *bg_record, List kill_job_list) } extern int bg_status_update_block_state(bg_record_t *bg_record, - bg_block_status_t state, + uint16_t state, List kill_job_list) { bool skipped_dealloc = false; kill_job_struct_t *freeit = NULL; int updated = 0; + uint16_t real_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG); - if (bg_record->job_running == BLOCK_ERROR_STATE - || bg_record->state == state) + if (real_state == state) return 0; debug("state of Block %s was %s and now is %s", @@ -139,8 +139,8 @@ extern int bg_status_update_block_state(bg_record_t *bg_record, check to make sure block went through freeing correctly */ - if ((bg_record->state != BG_BLOCK_TERM - && bg_record->state != BG_BLOCK_ERROR) + if ((real_state != BG_BLOCK_TERM + && !(bg_record->state & BG_BLOCK_ERROR_FLAG)) && state == BG_BLOCK_FREE) skipped_dealloc = 1; else if ((bg_record->state == BG_BLOCK_INITED) @@ -166,7 +166,12 @@ extern int bg_status_update_block_state(bg_record_t *bg_record, state and act like this didn't happen. 
*/ goto nochange_state; - bg_record->state = state; + if (state & BG_BLOCK_ERROR_FLAG) + bg_record->state |= state; + else if (bg_record->state & BG_BLOCK_ERROR_FLAG) + bg_record->state = state | BG_BLOCK_ERROR_FLAG; /* keep the flag, track the new real state */ + else + bg_record->state = state; if (bg_record->state == BG_BLOCK_TERM || skipped_dealloc) _block_is_deallocating(bg_record, kill_job_list); @@ -179,7 +184,7 @@ extern int bg_status_update_block_state(bg_record_t *bg_record, num_unused_cpus += bg_record->cpu_cnt; remove_from_bg_list(bg_lists->booted, bg_record); - } else if (bg_record->state == BG_BLOCK_ERROR) { + } else if (bg_record->state & BG_BLOCK_ERROR_FLAG) { if (bg_record->boot_state) error("Block %s in an error state while booting.", bg_record->bg_block_id); @@ -199,7 +204,19 @@ nochange_state: debug3("boot state for block %s is %d", bg_record->bg_block_id, bg_record->boot_state); if (bg_record->boot_state) { - switch(bg_record->state) { + if (bg_record->state & BG_BLOCK_ERROR_FLAG) { + /* If we get an error on boot that + * means it is a transparent L3 error + * and should be trying to fix + * itself. If this is the case we + * just hang out waiting for the state + * to go to free where we will try to + * boot again below. + */ + return updated; + } + + switch (bg_record->state) { case BG_BLOCK_BOOTING: debug3("checking to make sure user %s " "is the user.", @@ -213,16 +230,6 @@ nochange_state: last_job_update = time(NULL); } break; - case BG_BLOCK_ERROR: - /* If we get an error on boot that - * means it is a transparent L3 error - * and should be trying to fix - * itself. If this is the case we - * just hang out waiting for the state - * to go to free where we will try to - * boot again below. 
- */ - break; case BG_BLOCK_FREE: if (bg_record->boot_count < RETRY_BOOT_COUNT) { boot_block(bg_record); diff --git a/src/plugins/select/bluegene/bg_status.h b/src/plugins/select/bluegene/bg_status.h index ea0b41d78371413c1dbdf3c5f8874a1cf7c84352..2e854152f57edad5ecaf3d7c591ecc0fcb90cac9 100644 --- a/src/plugins/select/bluegene/bg_status.h +++ b/src/plugins/select/bluegene/bg_status.h @@ -41,7 +41,7 @@ #include "bg_core.h" extern int bg_status_update_block_state(bg_record_t *bg_record, - bg_block_status_t state, + uint16_t state, List kill_job_list); extern List bg_status_create_kill_job_list(void); extern void bg_status_process_kill_job_list(List kill_job_list); diff --git a/src/plugins/select/bluegene/bl/bridge_linker.c b/src/plugins/select/bluegene/bl/bridge_linker.c index 9799ba07bdbcb4abf40441d65e78bd4497842024..a0a6b05cb49c197623308f367802679cfea3ffdd 100644 --- a/src/plugins/select/bluegene/bl/bridge_linker.c +++ b/src/plugins/select/bluegene/bl/bridge_linker.c @@ -2152,7 +2152,7 @@ extern status_t bridge_get_data(rm_element_t* element, *state = BG_BLOCK_TERM; break; case RM_PARTITION_ERROR: - *state = BG_BLOCK_ERROR; + *state = BG_BLOCK_ERROR_FLAG; /* overwrite, like the sibling cases */ break; case RM_PARTITION_NAV: *state = BG_BLOCK_NAV; diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc index e61ac8cc7155baa301a743c2d39982777432848f..16e7f4816ac9237b19d5f1f2a9299935bf6c6447 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc @@ -250,8 +250,7 @@ extern int bridge_handle_runtime_errors(const char *function, return rc; } -extern bg_block_status_t bridge_translate_status( - bgsched::Block::Status state_in) +extern uint16_t bridge_translate_status(bgsched::Block::Status state_in) { switch (state_in) { case Block::Allocated: @@ -270,7 +269,7 @@ extern bg_block_status_t bridge_translate_status( return BG_BLOCK_TERM; break; default: - return BG_BLOCK_ERROR; + 
return BG_BLOCK_ERROR_FLAG; break; } error("unknown block state %d", state_in); diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index a4002347073fd5f196b15df95df6c6d4b3d9e242..1aadaffbee9066a606748836f544aa5579404de6 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -755,7 +755,7 @@ extern int select_p_state_save(char *dir_name) * the blocks in an error state */ #if defined HAVE_BG_FILES - if (bg_record->state != BG_BLOCK_ERROR) + if (!(bg_record->state & BG_BLOCK_ERROR_FLAG)) continue; #endif xassert(bg_record->bg_block_id != NULL); @@ -877,7 +877,7 @@ extern int select_p_state_restore(char *dir_name) no threads are started before this function. */ itr = list_iterator_create(bg_lists->main); while ((bg_record = list_next(itr))) { - if (bg_record->state == BG_BLOCK_ERROR) + if (bg_record->state & BG_BLOCK_ERROR_FLAG) put_block_in_error_state(bg_record, BLOCK_ERROR_STATE, NULL); } @@ -1337,7 +1337,7 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) bg_record->job_ptr = NULL; } - if (block_desc_ptr->state == BG_BLOCK_ERROR) { + if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) { bg_record_t *found_record = NULL; ListIterator itr; List delete_list = list_create(NULL); @@ -1470,7 +1470,7 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) to a normal state in accounting first */ itr = list_iterator_create(delete_list); while ((found_record = list_next(itr))) { - if (found_record->state == BG_BLOCK_ERROR) + if (found_record->state & BG_BLOCK_ERROR_FLAG) resume_block(found_record); } list_iterator_destroy(itr); @@ -1485,7 +1485,7 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) /* make sure if we are removing a block to put it back to a normal state in accounting first */ - if (bg_record->state == BG_BLOCK_ERROR) + if (bg_record->state & BG_BLOCK_ERROR_FLAG) resume_block(bg_record); 
term_jobs_on_block(bg_record->bg_block_id); @@ -1658,7 +1658,7 @@ extern int select_p_update_sub_node (update_block_msg_t *block_desc_ptr) } node_name = xstrdup_printf("%s%s", bg_conf->slurm_node_prefix, coord); /* find out how many nodecards to get for each ionode */ - if (block_desc_ptr->state == BG_BLOCK_ERROR) { + if (block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) { info("Admin setting %s[%s] in an error state", node_name, ionodes); for(i = 0; i<bg_conf->ionodes_per_mp; i++) { diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 6c151350998aed1f333e559eee0c9ed8c954ac32..8e36d80d8d2c315a5dc45ee9765938c01c2b64c7 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -1474,7 +1474,7 @@ _update_bluegene_block (int argc, char *argv[]) block_msg.bg_block_id = val; } else if (!strncasecmp(tag, "State", MAX(tag_len, 2))) { if (!strncasecmp(val, "ERROR", MAX(vallen, 1))) - block_msg.state = BG_BLOCK_ERROR; + block_msg.state = BG_BLOCK_ERROR_FLAG; else if (!strncasecmp(val, "FREE", MAX(vallen, 1))) block_msg.state = BG_BLOCK_FREE; else if (!strncasecmp(val, "RECREATE", MAX(vallen, 3))) @@ -1560,7 +1560,7 @@ _update_bluegene_subbp (int argc, char *argv[]) block_msg.mp_str = val; else if (!strncasecmp(tag, "State", MAX(tag_len, 2))) { if (!strncasecmp(val, "ERROR", MAX(vallen, 1))) - block_msg.state = BG_BLOCK_ERROR; + block_msg.state = BG_BLOCK_ERROR_FLAG; else if (!strncasecmp(val, "FREE", MAX(vallen, 1))) block_msg.state = BG_BLOCK_FREE; else { diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 4b911fc532f14d38de8548cfd8d4b314d7879e05..c3a001b8a006ad701a0bc864390c93550b943587 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1560,7 +1560,7 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) #ifdef HAVE_BG if (block_desc.bg_block_id) { block_desc.reason = slurm_strerror(comp_msg->slurm_rc); - block_desc.state = BG_BLOCK_ERROR; + block_desc.state = BG_BLOCK_ERROR_FLAG; i = 
select_g_update_block(&block_desc); error_code = MAX(error_code, i); xfree(block_desc.bg_block_id); diff --git a/src/sview/block_info.c b/src/sview/block_info.c index b2fc2cbea9e7ce7a30c58cb92c62a54ad4cb8edc..b3e28e609f99606aba21842403fbeaf589a0c053 100644 --- a/src/sview/block_info.c +++ b/src/sview/block_info.c @@ -627,7 +627,7 @@ need_refresh: state */ enum node_states state = NODE_STATE_UNKNOWN; - if (block_ptr->state == BG_BLOCK_ERROR) + if (block_ptr->state & BG_BLOCK_ERROR_FLAG) state = NODE_STATE_ERROR; else if (block_ptr->job_running > NO_JOB_RUNNING) state = NODE_STATE_ALLOCATED; @@ -779,7 +779,7 @@ extern int update_state_block(GtkDialog *dialog, "Are you sure you want to put block %s " "in an error state?", blockid); - block_msg.state = BG_BLOCK_ERROR; + block_msg.state = BG_BLOCK_ERROR_FLAG; } else if (!strcasecmp("Recreate block", type)) { snprintf(tmp_char, sizeof(tmp_char), "Are you sure you want to recreate block %s?", @@ -1252,7 +1252,7 @@ display_it: } list_push(send_block_list, block_ptr); - if (block_ptr->state == BG_BLOCK_ERROR) + if (block_ptr->state & BG_BLOCK_ERROR_FLAG) state = NODE_STATE_ERROR; else if (block_ptr->job_running > NO_JOB_RUNNING) state = NODE_STATE_ALLOCATED; diff --git a/src/sview/grid.c b/src/sview/grid.c index fcc9fec30ce9d87fbad67fa7c73f9aa6bf4f02bd..750e9463b9f55bde0698b1e2efbbc62d53d753fd 100644 --- a/src/sview/grid.c +++ b/src/sview/grid.c @@ -1361,7 +1361,7 @@ extern void add_extra_bluegene_buttons(List *button_list, int inx, bg_info_ptr->ionode_str); mp_str = tmp_nodes; } - if (bg_info_ptr->state == BG_BLOCK_ERROR) + if (bg_info_ptr->state & BG_BLOCK_ERROR_FLAG) grid_button->state = NODE_STATE_ERROR; else if (bg_info_ptr->job_running > NO_JOB_RUNNING) grid_button->state = NODE_STATE_ALLOCATED; diff --git a/src/sview/popups.c b/src/sview/popups.c index 2e5717b93c2f1cad2dcad6b27fb841b7407d7f38..942752b93bd54ed53473bc542b12546d375f4162 100644 --- a/src/sview/popups.c +++ b/src/sview/popups.c @@ -759,7 +759,7 @@ extern 
void create_search_popup(GtkAction *action, gpointer user_data) {G_TYPE_NONE, BG_BLOCK_INITED, "Inited", TRUE, -1}, {G_TYPE_NONE, BG_BLOCK_ALLOCATED, NULL, TRUE, -1}, {G_TYPE_NONE, BG_BLOCK_TERM, "Terminating", TRUE, -1}, - {G_TYPE_NONE, BG_BLOCK_ERROR, "Error", TRUE, -1}, + {G_TYPE_NONE, BG_BLOCK_ERROR_FLAG, "Error", TRUE, -1}, {G_TYPE_NONE, -1, NULL, FALSE, -1} }; display_data_t *display_data = pulldown_display_data;