diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c index 425a7568f20f2591dca96e2afd577aa6b33f39e0..58c9d538cf4bbdc412ed21600afa86ce23fd8502 100644 --- a/src/plugins/select/bluegene/bg_core.c +++ b/src/plugins/select/bluegene/bg_core.c @@ -88,6 +88,10 @@ static int _post_block_free(bg_record_t *bg_record, bool restore) bg_record->bg_block_id); xassert(0); return SLURM_SUCCESS; + } else if (bg_record->modifying) { + info("%d other are modifing this block %s", + bg_record->free_cnt, bg_record->bg_block_id); + return SLURM_SUCCESS; } else if (bg_record->free_cnt) { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("%d other are trying to destroy this block %s", @@ -141,19 +145,18 @@ static int _post_block_free(bg_record_t *bg_record, bool restore) rc = bridge_block_remove(bg_record); if (rc != SLURM_SUCCESS) { - /* if (rc == PARTITION_NOT_FOUND) { */ - /* debug("_post_block_free: block %s is not found", */ - /* bg_record->bg_block_id); */ - /* } else { */ + if (rc == BG_ERROR_BLOCK_NOT_FOUND) { + debug("_post_block_free: block %s is not found", + bg_record->bg_block_id); + } else { error("_post_block_free: " "bridge_block_remove(%s): %s", bg_record->bg_block_id, bg_err_str(rc)); - /* } */ - } else - if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) - info("_post_block_free: done %s(%p)", - bg_record->bg_block_id, bg_record); + } + } else if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) + info("_post_block_free: done %s(%p)", + bg_record->bg_block_id, bg_record); destroy_bg_record(bg_record); if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) @@ -365,7 +368,7 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) debug("block %s is not found", bg_record->bg_block_id); break; - } else if (rc == BG_ERROR_PENDING_ACTION) { + } else if (rc == BG_ERROR_INVALID_STATE) { #ifndef HAVE_BGL /* If the state is error and we get an incompatible @@ -501,8 +504,6 @@ extern int free_block_list(uint32_t job_id, List track_list, if (remove_from_bg_list(bg_lists->job_running, bg_record) == SLURM_SUCCESS) num_unused_cpus += bg_record->cpu_cnt; - - bg_free_block(bg_record, 0, 1); } list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); @@ -876,8 +877,6 @@ extern const char *bg_err_str(int inx) switch (inx) { case SLURM_SUCCESS: return "Status OK"; - case BG_ERROR_PENDING_ACTION: - return "Action already pending"; case BG_ERROR_BLOCK_NOT_FOUND: return "Block not found"; case BG_ERROR_BOOT_ERROR: diff --git a/src/plugins/select/bluegene/bg_enums.h b/src/plugins/select/bluegene/bg_enums.h index 330032b643328ae29391b645fc0e6161973d2880..1013be975d37c1b5db5e3913b3aee89417030a18 100644 --- a/src/plugins/select/bluegene/bg_enums.h +++ b/src/plugins/select/bluegene/bg_enums.h @@ -154,8 +154,7 @@ typedef enum { #define REMOVE_USER_FOUND 2 typedef enum { - BG_ERROR_PENDING_ACTION = 100, - BG_ERROR_INVALID_STATE, + BG_ERROR_INVALID_STATE = 100, BG_ERROR_BLOCK_NOT_FOUND, BG_ERROR_BOOT_ERROR, BG_ERROR_JOB_NOT_FOUND, diff --git a/src/plugins/select/bluegene/bg_job_run.c b/src/plugins/select/bluegene/bg_job_run.c index 126496a9af31c53fed922d91c1089cd1946c5cdd..c015d43aff0db0e28216fca6f87de165f24bdc9f 100644 --- a/src/plugins/select/bluegene/bg_job_run.c +++ b/src/plugins/select/bluegene/bg_job_run.c @@ -229,17 +229,15 @@ static void _start_agent(bg_action_t *bg_action_ptr) } if (bg_record->state == BG_BLOCK_TERM) { debug("Block is in Deallocating state, waiting for free."); + /* Increment free_cnt to make sure we don't loose this + * block since bg_free_block will unlock block_state_mutex. + */ + bg_record->free_cnt++; bg_free_block(bg_record, 1, 1); + bg_record->free_cnt--; /* no reason to reboot here since we are already deallocating */ bg_action_ptr->reboot = 0; - /* Since bg_free_block will unlock block_state_mutex - we need to make sure the block we want is still - around. Failure will unlock this so no need to - unlock before return. - */ - if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) - return; } delete_list = list_create(NULL); @@ -301,7 +299,9 @@ static void _start_agent(bg_action_t *bg_action_ptr) slurm_mutex_lock(&block_state_mutex); /* Failure will unlock block_state_mutex so no need to unlock before - return. */ + return. Failure will unlock block_state_mutex so no need to unlock + before return. + */ if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) return; @@ -381,15 +381,12 @@ static void _start_agent(bg_action_t *bg_action_ptr) if (rc) { bg_record->modifying = 1; + /* Increment free_cnt to make sure we don't loose this + * block since bg_free_block will unlock block_state_mutex. + */ + bg_record->free_cnt++; bg_free_block(bg_record, 1, 1); - - /* Since bg_free_block will unlock block_state_mutex - we need to make sure the block we want is still - around. Failure will unlock block_state_mutex so - no need to unlock before return. - */ - if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) - return; + bg_record->free_cnt--; #if defined HAVE_BG_FILES && defined HAVE_BG_L_P #ifdef HAVE_BGL @@ -469,15 +466,12 @@ static void _start_agent(bg_action_t *bg_action_ptr) } else if (bg_action_ptr->reboot) { bg_record->modifying = 1; + /* Increment free_cnt to make sure we don't loose this + * block since bg_free_block will unlock block_state_mutex. + */ + bg_record->free_cnt++; bg_free_block(bg_record, 1, 1); - - /* Since bg_free_block will unlock block_state_mutex - we need to make sure the block we want is still - around. Failure will unlock block_state_mutex so - no need to unlock before return. - */ - if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) - return; + bg_record->free_cnt--; bg_record->modifying = 0; } @@ -952,22 +946,22 @@ extern int boot_block(bg_record_t *bg_record) info("Booting block %s", bg_record->bg_block_id); if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) { - /* error("bridge_create_block(%s): %s", */ - /* bg_record->bg_block_id, bg_err_str(rc)); */ - /* if (rc == INCOMPATIBLE_STATE) { */ - /* char reason[200]; */ - /* snprintf(reason, sizeof(reason), */ - /* "boot_block: " */ - /* "Block %s is in an incompatible state. " */ - /* "This usually means hardware is allocated " */ - /* "by another block (maybe outside of SLURM).", */ - /* bg_record->bg_block_id); */ - /* bg_record->boot_state = 0; */ - /* bg_record->boot_count = 0; */ - /* slurm_mutex_unlock(&block_state_mutex); */ - /* requeue_and_error(bg_record, reason); */ - /* slurm_mutex_lock(&block_state_mutex); */ - /* } */ + error("bridge_create_block(%s): %s", + bg_record->bg_block_id, bg_err_str(rc)); + if (rc == BG_ERROR_BOOT_ERROR) { + char reason[200]; + snprintf(reason, sizeof(reason), + "boot_block: " + "Block %s is in an incompatible state. " + "This usually means hardware is allocated " + "by another block (maybe outside of SLURM).", + bg_record->bg_block_id); + bg_record->boot_state = 0; + bg_record->boot_count = 0; + slurm_mutex_unlock(&block_state_mutex); + requeue_and_error(bg_record, reason); + slurm_mutex_lock(&block_state_mutex); + } return SLURM_ERROR; } diff --git a/src/plugins/select/bluegene/bl/bridge_linker.c b/src/plugins/select/bluegene/bl/bridge_linker.c index a0a6b05cb49c197623308f367802679cfea3ffdd..4cf1d90e1edba612cb217628c8e1e7e1cf4ca434 100644 --- a/src/plugins/select/bluegene/bl/bridge_linker.c +++ b/src/plugins/select/bluegene/bl/bridge_linker.c @@ -143,7 +143,7 @@ static int _bg_errtrans(int in) case PARTITION_NOT_FOUND: return BG_ERROR_BLOCK_NOT_FOUND; case INCOMPATIBLE_STATE: - return BG_ERROR_PENDING_ACTION; + return BG_ERROR_INVALID_STATE; case CONNECTION_ERROR: return BG_ERROR_CONNECTION_ERROR; case JOB_NOT_FOUND: @@ -380,7 +380,7 @@ static int _remove_job(db_job_id_t job_id, char *block_id) job_id, block_id); return SLURM_SUCCESS; } - if (rc == BG_ERROR_PENDING_ACTION) + if (rc == BG_ERROR_INVALID_STATE) debug("job %d on block %s is in an " "INCOMPATIBLE_STATE", job_id, block_id); @@ -1218,6 +1218,9 @@ extern int bridge_block_boot(bg_record_t *bg_record) slurm_mutex_lock(&api_file_mutex); rc = _bg_errtrans((*(bridge_api.create_partition)) (bg_record->bg_block_id)); + if (rc == BG_ERROR_INVALID_STATE) + rc = BG_ERROR_BOOT_ERROR; + slurm_mutex_unlock(&api_file_mutex); return rc; #else diff --git a/src/plugins/select/bluegene/bl/bridge_status.c b/src/plugins/select/bluegene/bl/bridge_status.c index 23b2dd26d84fbc30f3f6bfc828bd43a9b610db28..e9b9c67ea7e7452a8bc9dd370d51f88a1db5777b 100644 --- a/src/plugins/select/bluegene/bl/bridge_status.c +++ b/src/plugins/select/bluegene/bl/bridge_status.c @@ -845,12 +845,12 @@ extern int bridge_status_update_block_list_state(List block_list) != SLURM_SUCCESS) { if (bg_conf->layout_mode == LAYOUT_DYNAMIC) { switch(rc) { - case INCONSISTENT_DATA: + case BG_ERROR_INVALID_STATE: debug2("got inconsistent data when " "querying block %s", name); continue; break; - case PARTITION_NOT_FOUND: + case BG_ERROR_BLOCK_NOT_FOUND: debug("block %s not found, removing " "from slurm", name); /* Just set to free, diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc index 4a4196c33f256ad5d9d76fa1799b13cd61408c78..74f65972a2f7172fa56ccd53651e0532817bffa7 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc @@ -222,10 +222,11 @@ extern int bridge_handle_runtime_errors(const char *function, case bgsched::RuntimeErrors::BlockBootError: error("%s: Error booting block %s.", function, bg_record->bg_block_id); + rc = BG_ERROR_BOOT_ERROR; break; case bgsched::RuntimeErrors::BlockFreeError: /* not a real error */ - rc = BG_ERROR_PENDING_ACTION; + rc = BG_ERROR_INVALID_STATE; debug2("%s: Error freeing block %s.", function, bg_record->bg_block_id); break; diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc index 64d2b26dce7b2ddbca4283dd91c8715ab559de0d..50772f32bdb558eb01284fee7d8b7442d7a60488 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc @@ -601,8 +601,8 @@ extern int bridge_block_remove(bg_record_t *bg_record) Block::remove(bg_record->bg_block_id); } catch (const bgsched::RuntimeException& err) { rc = bridge_handle_runtime_errors("Block::remove", - err.getError().toValue(), - bg_record); + err.getError().toValue(), + bg_record); if (rc != SLURM_SUCCESS) return rc; } catch (const bgsched::DatabaseException& err) { diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index e4bea7c4c83f08539c11ec173621678b38ec9938..a71413b509abecd99f33ea5d79dd2129e2b2753a 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -1415,7 +1415,12 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) list_destroy(delete_list); put_block_in_error_state(bg_record, BLOCK_ERROR_STATE, reason); } else if (block_desc_ptr->state == BG_BLOCK_FREE) { + /* Increment free_cnt to make sure we don't loose this + * block since bg_free_block will unlock block_state_mutex. + */ + bg_record->free_cnt++; bg_free_block(bg_record, 0, 1); + bg_record->free_cnt--; resume_block(bg_record); slurm_mutex_unlock(&block_state_mutex); } else if (block_desc_ptr->state == BG_BLOCK_TERM) { @@ -1512,7 +1517,12 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("select_p_update_block: " "freeing the block %s.", bg_record->bg_block_id); + /* Increment free_cnt to make sure we don't loose this + * block since bg_free_block will unlock block_state_mutex. + */ + bg_record->free_cnt++; bg_free_block(bg_record, 1, 1); + bg_record->free_cnt--; if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("select_p_update_block: done");