diff --git a/src/common/node_select.h b/src/common/node_select.h index 23722edaddf1dff690df41c7ca1abe33ffd1afe4..0449c37310e186d433ec1e248fc0fd35070f0a48 100644 --- a/src/common/node_select.h +++ b/src/common/node_select.h @@ -52,6 +52,7 @@ /* NO_JOB_RUNNING is used by select/blugene, select/bgq, smap and sview */ #define NO_JOB_RUNNING -1 +#define NOT_FROM_CONTROLLER -2 #define PASS_DENY_A 0x0001 #define PASS_DENY_X 0x0002 diff --git a/src/plugins/select/bluegene/ba/wire_test.c b/src/plugins/select/bluegene/ba/wire_test.c index e5e29b0ae12c97d887ddbc03ee226d6793667cfc..0369c5bec1b239461bfc077bed2a8c79c94ef486 100644 --- a/src/plugins/select/bluegene/ba/wire_test.c +++ b/src/plugins/select/bluegene/ba/wire_test.c @@ -58,7 +58,7 @@ bg_config_t *bg_conf = NULL; bg_lists_t *bg_lists = NULL; pthread_mutex_t block_state_mutex = PTHREAD_MUTEX_INITIALIZER; int blocks_are_created = 0; -int bg_recover = -2; +int bg_recover = NOT_FROM_CONTROLLER; extern List bg_status_create_kill_job_list(void) { diff --git a/src/plugins/select/bluegene/ba_bgq/wire_test.c b/src/plugins/select/bluegene/ba_bgq/wire_test.c index 46f28ef5a583bf1c375b862f17a224ac0aa9b20d..29fc06a0965e6393aea01e8d3503957bba6bb7b0 100644 --- a/src/plugins/select/bluegene/ba_bgq/wire_test.c +++ b/src/plugins/select/bluegene/ba_bgq/wire_test.c @@ -57,6 +57,7 @@ time_t last_bg_update; bg_config_t *bg_conf; bg_lists_t *bg_lists; pthread_mutex_t block_state_mutex = PTHREAD_MUTEX_INITIALIZER; +int bg_recover = NOT_FROM_CONTROLLER; extern int bg_status_update_block_state(bg_record_t *bg_record, bg_block_status_t state, diff --git a/src/plugins/select/bluegene/ba_common.c b/src/plugins/select/bluegene/ba_common.c index 78d6f894b8f18330441ab38fa9a040636abd55fb..4e57d41ba96718d29c8b65a3fd5ef691ecd6bce9 100644 --- a/src/plugins/select/bluegene/ba_common.c +++ b/src/plugins/select/bluegene/ba_common.c @@ -153,8 +153,8 @@ extern void ba_init(node_info_msg_t *node_info_ptr, bool sanity_check) cluster_dims = 
slurmdb_setup_cluster_dims(); cluster_flags = slurmdb_setup_cluster_flags(); set_ba_debug_flags(slurm_get_debug_flags()); - - bridge_init(""); + if (bg_recover != NOT_FROM_CONTROLLER) + bridge_init(""); memset(coords, 0, sizeof(coords)); memset(dims, 0, sizeof(dims)); @@ -311,7 +311,7 @@ node_info_error: /* sanity check. We can only request part of the system, but we don't want to allow more than we have. */ - if (sanity_check) { + if (sanity_check && (bg_recover != NOT_FROM_CONTROLLER)) { verbose("Attempting to contact MMCS"); if (bridge_get_size(real_dims) == SLURM_SUCCESS) { char real_dim_str[cluster_dims+1]; @@ -354,7 +354,8 @@ setup_done: ba_create_system(num_cpus, real_dims, dims); - bridge_setup_system(); + if (bg_recover != NOT_FROM_CONTROLLER) + bridge_setup_system(); ba_initialized = true; init_grid(node_info_ptr); @@ -370,7 +371,8 @@ extern void ba_fini() return; } - bridge_fini(); + if (bg_recover != NOT_FROM_CONTROLLER) + bridge_fini(); ba_destroy_system(); ba_initialized = false; diff --git a/src/plugins/select/bluegene/bg_core.h b/src/plugins/select/bluegene/bg_core.h index dce916523741461202a3f942a78a0c3bada1f925..cbca517cea8e776fdba42597b73f11d15048da23 100644 --- a/src/plugins/select/bluegene/bg_core.h +++ b/src/plugins/select/bluegene/bg_core.h @@ -43,13 +43,6 @@ #include "bg_enums.h" #include "bg_structs.h" #include "bg_record_functions.h" - -/* Change BLOCK_STATE_VERSION value when changing the state save - * format i.e. pack_block() */ -#define BLOCK_STATE_VERSION "VER005" -#define BLOCK_2_2_STATE_VERSION "VER004" /*Slurm 2.2's version*/ -#define BLOCK_2_1_STATE_VERSION "VER003" /*Slurm 2.1's version*/ - #include "bg_job_place.h" #include "bg_job_run.h" #include "bg_job_info.h" @@ -58,6 +51,12 @@ #include "bridge_linker.h" #include "bg_status.h" +/* Change BLOCK_STATE_VERSION value when changing the state save + * format i.e. 
pack_block() */ +#define BLOCK_STATE_VERSION "VER005" +#define BLOCK_2_2_STATE_VERSION "VER004" /*Slurm 2.2's version*/ +#define BLOCK_2_1_STATE_VERSION "VER003" /*Slurm 2.1's version*/ + /* Global variables */ /* extern bg_config_t *bg_conf; */ /* extern bg_lists_t *bg_lists; */ diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc index fcf96fc51c90475d8379b93cc053e0c7992374dc..13fac4bd09668e1550d5fd05e9edf8370afe8901 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc @@ -46,7 +46,11 @@ extern "C" { #if defined HAVE_BG_FILES +#include <bgsched/DatabaseException.h> +#include <bgsched/InitializationException.h> #include <bgsched/InputException.h> +#include <bgsched/InternalException.h> +#include <bgsched/RuntimeException.h> #include <bgsched/bgsched.h> #include <bgsched/Block.h> #include <bgsched/core/core.h> @@ -127,7 +131,6 @@ static int _block_wait_for_jobs(char *bg_block_id) std::vector<Job::ConstPtr> job_vec; JobFilter job_filter; JobFilter::Statuses job_statuses; - int count = 0; #endif if (!bridge_init(NULL)) @@ -151,9 +154,6 @@ static int _block_wait_for_jobs(char *bg_block_id) job_filter.setStatuses(&job_statuses); while (1) { - if (count) - sleep(POLL_INTERVAL); - count++; job_vec = getJobs(job_filter); if (job_vec.empty()) return SLURM_SUCCESS; @@ -162,6 +162,7 @@ static int _block_wait_for_jobs(char *bg_block_id) debug("waiting on job %lu to finish on block %s", job_ptr->getId(), bg_block_id); } + sleep(POLL_INTERVAL); } #endif return SLURM_SUCCESS; @@ -216,6 +217,9 @@ extern int bridge_init(char *properties_file) if (initialized) return 1; + if (bg_recover == NOT_FROM_CONTROLLER) + return 0; + #if defined HAVE_BG_FILES bgsched::init(properties_file); #endif @@ -228,7 +232,8 @@ extern int bridge_init(char *properties_file) extern int bridge_fini() { initialized = false; - bridge_status_fini(); + if (bg_recover != 
NOT_FROM_CONTROLLER) + bridge_status_fini(); return SLURM_SUCCESS; } @@ -365,30 +370,61 @@ extern int bridge_block_create(bg_record_t *bg_record) } try { block_ptr = Block::create(midplanes, pt_midplanes, conn_type); - } catch (bgsched::InputException err) { - // switch(err.getError()) { - // case bgsched::InputErrors::InvalidMidplanes: - // fatal("Couldn't create block, failing."); - // break; - // default: - fatal("unknown"); - // } + } catch (const bgsched::InputException& err) { + switch (err.getError().toValue()) { + case bgsched::InputErrors::InvalidMidplanes: + error("Invalid midplanes given for Block::Create()"); + break; + case bgsched::InputErrors::InvalidConnectivity: + error("Invalid connectivity given for Block::Create()"); + break; + case bgsched::InputErrors::BlockNotCreated: + error("Block::Create() can not create block from " + "input arguments"); + break; + default: + error("Unexpected exception value from " + "Block::Create()"); + } rc = SLURM_ERROR; } - block_ptr->setName(bg_record->bg_block_id); - block_ptr->setMicroLoaderImage(bg_record->mloaderimage); + block_ptr->setName(bg_record->bg_block_id); + block_ptr->setMicroLoaderImage(bg_record->mloaderimage); + try { block_ptr->add(""); // block_ptr->addUser(bg_record->bg_block_id, // bg_record->user_name); //info("got past add"); + } catch (const bgsched::InputException& err) { + switch (err.getError().toValue()) { + case bgsched::InputErrors::BlockNotAdded: + error("For some reason the block was not added."); + break; + case bgsched::InputErrors::InvalidMidplanes: + error("Invalid midplanes given for Block::Add()"); + break; + default: + error("Unexpected exception value from " + "Block::Add() %d", err.getError().toValue()); + } + rc = SLURM_ERROR; + } catch (const bgsched::RuntimeException& err2) { + switch (err2.getError().toValue()) { + case bgsched::RuntimeErrors::BlockAddError: + error("Error Setting block owner Block:Add()."); + break; + default: + error("2 Unexpected exception value from 
" + "Block::Add() %d", err2.getError().toValue()); + } + rc = SLURM_ERROR; } catch (...) { - fatal("Couldn't create block, failing."); + error("Unknown error from Block::Add()."); rc = SLURM_ERROR; } - #endif return rc; diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc index c876282b972b98ad2986a39bc421fdafa9766d70..44bd7ddd7d700c07bd3be765c475ea988410594d 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc @@ -62,7 +62,7 @@ extern "C" { using namespace std; using namespace bgsched; using namespace bgsched::core; - +using namespace bgsched::realtime; #endif static bool bridge_status_inited = false; @@ -78,7 +78,31 @@ public: * Handle a block state changed real-time event. */ void handleBlockStateChangedRealtimeEvent( - const BlockStateChangedEventInfo& eventInfo); + const BlockStateChangedEventInfo& info); + // /* + // * Handle a midplane state changed real-time event. + // */ + // virtual void handleMidplaneStateChangedRealtimeEvent( + // const MidplaneStateChangedEventInfo& info); + + // /* + // * Handle a switch state changed real-time event. + // */ + // virtual void handleSwitchStateChangedRealtimeEvent( + // const SwitchStateChangedEventInfo& info); + + // /* + // * Handle a node board state changed real-time event. + // */ + // virtual void handleNodeBoardStateChangedRealtimeEvent( + // const NodeBoardStateChangedEventInfo& info); + + // /* + // * Handle a cable state changed real-time event. + // */ + // virtual void handleCableStateChangedRealtimeEvent( + // const CableStateChangedEventInfo& info); + } event_handler_t; static List kill_job_list = NULL; @@ -154,7 +178,6 @@ static int _real_time_connect(void) try { rt_client_ptr->connect(); rc = SLURM_SUCCESS; - count = 0; } catch (...) 
{ rc = SLURM_ERROR; error("couldn't connect to the real_time server, " @@ -170,41 +193,54 @@ static int _real_time_connect(void) static void *_real_time(void *no_data) { event_handler_t event_hand; - + int rc = SLURM_SUCCESS; bool failed = false; - bgsched::realtime::Filter rt_filter( - bgsched::realtime::Filter::createNone()); + Filter::BlockStatuses block_statuses; + Filter rt_filter(Filter::createNone()); rt_filter.setBlocks(true); - //rt_filter.setBlockDeleted(true); - // filter.get().setMidplanes(true); - // filter.get().setNodeBoards(true); - // filter.get().setSwitches(true); - // filter.get().setCables(true); + block_statuses.insert(Block::Free); + block_statuses.insert(Block::Booting); + block_statuses.insert(Block::Initialized); + block_statuses.insert(Block::Terminating); + rt_filter.setBlockStatuses(&block_statuses); + + // rt_filter.get().setMidplanes(true); + // rt_filter.get().setNodeBoards(true); + // rt_filter.get().setSwitches(true); + // rt_filter.get().setCables(true); rt_client_ptr->addListener(event_hand); - _real_time_connect(); + rc = _real_time_connect(); - while (bridge_status_inited && !failed) { + while (bridge_status_inited) { bgsched::realtime::Filter::Id filter_id; // Assigned filter id slurm_mutex_lock(&rt_mutex); - if (!bridge_status_inited) + if (!bridge_status_inited) { + slurm_mutex_unlock(&rt_mutex); break; + } - rt_client_ptr->setFilter(rt_filter, &filter_id, NULL); + if (rc == SLURM_SUCCESS) { + rt_client_ptr->setFilter(rt_filter, &filter_id, NULL); + + rt_client_ptr->requestUpdates(NULL); + rt_client_ptr->receiveMessages(NULL, NULL, &failed); + } else + failed = true; - rt_client_ptr->requestUpdates(NULL); - rt_client_ptr->receiveMessages(NULL, NULL, &failed); slurm_mutex_unlock(&rt_mutex); if (bridge_status_inited && failed) { error("Disconnected from real-time events. 
" - "Will try to reconnect."); - _real_time_connect(); - info("real-time server connected again"); - failed = false; + "Will try tCopy of SP2o reconnect."); + rc = _real_time_connect(); + if (rc == SLURM_SUCCESS) { + info("real-time server connected again"); + failed = false; + } } } return NULL; @@ -217,8 +253,10 @@ static void *_poll(void *no_data) while (bridge_status_inited) { //debug("polling waiting until realtime dies"); slurm_mutex_lock(&rt_mutex); - if (!bridge_status_inited) + if (!bridge_status_inited) { + slurm_mutex_unlock(&rt_mutex); break; + } //debug("polling taking over, realtime is dead"); bridge_status_do_poll(); slurm_mutex_unlock(&rt_mutex); diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index 4a1138680ee32fe46bea9636a710294025fc4a3e..8bfd8f764cd7dc6465b2a3d36437a7bfdf0cb5b5 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -52,12 +52,8 @@ #include "src/slurmctld/trigger_mgr.h" #include <fcntl.h> -#define MMCS_POLL_TIME 30 /* seconds between poll of MMCS for - * down switches and nodes */ -#define BG_POLL_TIME 1 /* seconds between poll of state - * change in bg blocks */ #define HUGE_BUF_SIZE (1024*16) -#define NOT_FROM_CONTROLLER -2 + /* These are defined here so when we link with something other than * the slurmctld we will have these symbols defined. They will get * overwritten when linking with the slurmctld. @@ -606,7 +602,7 @@ static List _get_config(void) * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. 
*/ -extern int init ( void ) +extern int init(void) { #ifdef HAVE_BG diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 1a22007780dd3cccef89f6772c1b154ecf8056ba..c01ae7878c63b945317facda03ad7c27e9e910a0 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -55,7 +55,6 @@ #include "other_select.h" #include "basil_interface.h" -#define NOT_FROM_CONTROLLER -2 /* These are defined here so when we link with something other than * the slurmctld we will have these symbols defined. They will get * overwritten when linking with the slurmctld.