From 5fa28b7f7653ba812b877f0fc3e2b5e6f46444aa Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Wed, 26 Oct 2011 15:13:28 -0700 Subject: [PATCH] BGQ - better error checking for when API throws errors --- .../select/bluegene/bl_bgq/bridge_helper.cc | 93 ++++++++- .../select/bluegene/bl_bgq/bridge_helper.h | 7 + .../select/bluegene/bl_bgq/bridge_linker.cc | 190 +++++++++++------- .../select/bluegene/bl_bgq/bridge_status.cc | 110 +++++++--- 4 files changed, 292 insertions(+), 108 deletions(-) diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc index 1cbc3984a5d..09345356cb7 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc @@ -487,7 +487,7 @@ extern Block::Ptrs bridge_get_blocks(BlockFilter filter) Block::Ptrs vec; try { - vec = getBlocks(filter, BlockSort::AnyOrder); + vec = getBlocks(filter); } catch (const bgsched::DatabaseException& err) { bridge_handle_database_errors("getBlocks", err.getError().toValue()); @@ -505,4 +505,95 @@ extern Block::Ptrs bridge_get_blocks(BlockFilter filter) return vec; } +extern Midplane::ConstPtr bridge_get_midplane(ComputeHardware::ConstPtr bgqsys, + ba_mp_t *ba_mp) +{ + Midplane::ConstPtr mp_ptr; + + assert(ba_mp); + + try { + Coordinates::Coordinates coords( + ba_mp->coord[0], ba_mp->coord[1], + ba_mp->coord[2], ba_mp->coord[3]); + mp_ptr = bgqsys->getMidplane(coords); + } catch (const bgsched::InputException& err) { + bridge_handle_input_errors( + "ComputeHardware::getMidplane", + err.getError().toValue(), NULL); + } catch (...) { + error("Unknown error from ComputeHardware::getMidplane."); + } + return mp_ptr; +} + +extern Node::ConstPtrs bridge_get_midplane_nodes(const std::string& loc) +{ + Node::ConstPtrs vec; + + try { + vec = getMidplaneNodes(loc); + } catch (const bgsched::DatabaseException& err) { + bridge_handle_database_errors("getMidplaneNodes", + err.getError().toValue()); + } catch (const bgsched::InputException& err) { + bridge_handle_input_errors("getMidplaneNodes", + err.getError().toValue(), + NULL); + } catch (const bgsched::InternalException& err) { + bridge_handle_internal_errors("getMidplaneNodes", + err.getError().toValue()); + } catch (...) { + error("Unknown error from getMidplaneNodes."); + } + return vec; +} + +extern NodeBoard::ConstPtr bridge_get_nodeboard(Midplane::ConstPtr mp_ptr, + int nodeboard_num) +{ + NodeBoard::ConstPtr nb_ptr; + + try { + nb_ptr = mp_ptr->getNodeBoard(nodeboard_num); + } catch (const bgsched::InputException& err) { + bridge_handle_input_errors("Midplane::getNodeBoard", + err.getError().toValue(), + NULL); + } catch (...) { + error("Unknown error from Midplane::getNodeBoard."); + } + return nb_ptr; +} + +extern Switch::ConstPtr bridge_get_switch(Midplane::ConstPtr mp_ptr, int dim) +{ + Switch::ConstPtr switch_ptr; + + try { + switch_ptr = mp_ptr->getSwitch(dim); + } catch (const bgsched::InputException& err) { + bridge_handle_input_errors("Midplane::getSwitch", + err.getError().toValue(), + NULL); + } catch (...) { + error("Unknown error from Midplane::getSwitch."); + } + return switch_ptr; +} + +extern ComputeHardware::ConstPtr bridge_get_compute_hardware() +{ + ComputeHardware::ConstPtr bgqsys; + + try { + bgqsys = getComputeHardware(); + } catch (const bgsched::InternalException& err) { + bridge_handle_internal_errors("getComputeHardware", + err.getError().toValue()); + } catch (...) { + error("Unknown error from getComputeHardware"); + } + return bgqsys; +} #endif diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_helper.h b/src/plugins/select/bluegene/bl_bgq/bridge_helper.h index 750acce913e..0889a9733ff 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_helper.h +++ b/src/plugins/select/bluegene/bl_bgq/bridge_helper.h @@ -103,6 +103,13 @@ extern const char *bridge_hardware_state_string(const int state); /* helper functions */ extern Block::Ptrs bridge_get_blocks(BlockFilter filter); +extern Midplane::ConstPtr bridge_get_midplane(ComputeHardware::ConstPtr bgqsys, + ba_mp_t *ba_mp); +extern Node::ConstPtrs bridge_get_midplane_nodes(const std::string& loc); +extern NodeBoard::ConstPtr bridge_get_nodeboard(Midplane::ConstPtr mp_ptr, + int nodeboard_num); +extern Switch::ConstPtr bridge_get_switch(Midplane::ConstPtr mp_ptr, int dim); +extern ComputeHardware::ConstPtr bridge_get_compute_hardware(); #endif diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc index b39d4cead4f..fa8fcaa538d 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc @@ -76,6 +76,9 @@ static void _setup_ba_mp(int level, uint16_t *coords, Midplane::ConstPtr mp_ptr; int i; + if (!bgqsys) + fatal("_setup_ba_mp: No ComputeHardware ptr"); + if (level > SYSTEM_DIMENSIONS) return; @@ -89,38 +92,19 @@ static void _setup_ba_mp(int level, uint16_t *coords, return; } - if (!(ba_mp = coord2ba_mp(coords))) + if (!(ba_mp = coord2ba_mp(coords)) + || !(mp_ptr = bridge_get_midplane(bgqsys, ba_mp))) return; - try { - Coordinates::Coordinates coords( - ba_mp->coord[A], ba_mp->coord[X], - ba_mp->coord[Y], ba_mp->coord[Z]); - mp_ptr = bgqsys->getMidplane(coords); - } catch (const bgsched::InputException& err) { - int rc = bridge_handle_input_errors( - "ComputeHardware::getMidplane", - err.getError().toValue(), NULL); - if (rc != SLURM_SUCCESS) - return; - } - ba_mp->loc = xstrdup(mp_ptr->getLocation().c_str()); ba_mp->nodecard_loc = (char **)xmalloc(sizeof(char *) * bg_conf->mp_nodecard_cnt); for (i=0; i<bg_conf->mp_nodecard_cnt; i++) { - try { - NodeBoard::ConstPtr nodeboard = mp_ptr->getNodeBoard(i); + NodeBoard::ConstPtr nb_ptr = bridge_get_nodeboard(mp_ptr, i); + if (nb_ptr) ba_mp->nodecard_loc[i] = - xstrdup(nodeboard->getLocation().c_str()); - } catch (const bgsched::InputException& err) { - int rc = bridge_handle_input_errors( - "Midplane::getNodeBoard", - err.getError().toValue(), NULL); - if (rc != SLURM_SUCCESS) - ; - } + xstrdup(nb_ptr->getLocation().c_str()); } } @@ -167,9 +151,18 @@ static bg_record_t * _translate_object_to_block(const Block::Ptr &block_ptr) bg_record->conn_type[0] = SELECT_SMALL; } else { for (Dimension dim=Dimension::A; dim<=Dimension::D; dim++) { - bg_record->conn_type[dim] = - block_ptr->isTorus(dim) ? - SELECT_TORUS : SELECT_MESH; + try { + bg_record->conn_type[dim] = + block_ptr->isTorus(dim) ? + SELECT_TORUS : SELECT_MESH; + } catch (const bgsched::InputException& err) { + bridge_handle_input_errors( + "Block::isTorus", + err.getError().toValue(), + NULL); + } catch (...) { + error("Unknown error from Block::isTorus."); + } } /* Set the bitmap blank here if it is a full node we don't want anything set we also @@ -269,13 +262,24 @@ static int _block_wait_for_jobs(char *bg_block_id) job_filter.setStatuses(&job_statuses); while (1) { - job_vec = getJobs(job_filter); - if (job_vec.empty()) - return SLURM_SUCCESS; - - BOOST_FOREACH(const Job::ConstPtr& job_ptr, job_vec) { - debug("waiting on mmcs job %lu to finish on block %s", - job_ptr->getId(), bg_block_id); + try { + job_vec = getJobs(job_filter); + if (job_vec.empty()) + return SLURM_SUCCESS; + + BOOST_FOREACH(const Job::ConstPtr& job_ptr, job_vec) { + debug("waiting on mmcs job %lu to " + "finish on block %s", + job_ptr->getId(), bg_block_id); + } + } catch (const bgsched::DatabaseException& err) { + bridge_handle_database_errors("getJobs", + err.getError().toValue()); + } catch (const bgsched::InternalException& err) { + bridge_handle_internal_errors("getJobs", + err.getError().toValue()); + } catch (...) { + error("Unknown error from getJobs."); } sleep(POLL_INTERVAL); } @@ -335,7 +339,15 @@ extern int bridge_init(char *properties_file) #ifdef HAVE_BG_FILES if (!properties_file) properties_file = (char *)""; - bgsched::init(properties_file); + try { + bgsched::init(properties_file); + } catch (const bgsched::InitializationException& err) { + bridge_handle_init_errors("bgsched::init", + err.getError().toValue()); + fatal("can't init bridge"); + } catch (...) { + fatal("Unknown error from bgsched::init, can't continue"); + } #endif initialized = true; @@ -358,9 +370,16 @@ extern int bridge_get_size(int *size) #ifdef HAVE_BG_FILES memset(size, 0, sizeof(int) * SYSTEM_DIMENSIONS); - Coordinates bgq_size = core::getMachineSize(); - for (int dim=0; dim< SYSTEM_DIMENSIONS; dim++) - size[dim] = bgq_size[dim]; + try { + Coordinates bgq_size = core::getMachineSize(); + for (int dim=0; dim< SYSTEM_DIMENSIONS; dim++) + size[dim] = bgq_size[dim]; + } catch (const bgsched::DatabaseException& err) { + bridge_handle_database_errors("core::getMachineSize", + err.getError().toValue()); + } catch (...) { + error("Unknown error from core::getMachineSize"); + } #endif return SLURM_SUCCESS; @@ -382,7 +401,7 @@ extern int bridge_setup_system() #ifdef HAVE_BG_FILES uint16_t coords[SYSTEM_DIMENSIONS]; - _setup_ba_mp(0, coords, getComputeHardware()); + _setup_ba_mp(0, coords, bridge_get_compute_hardware()); #endif return SLURM_SUCCESS; @@ -470,26 +489,22 @@ extern int bridge_block_create(bg_record_t *bg_record) try { block_ptr = Block::create(nodecards); + rc = SLURM_SUCCESS; } catch (const bgsched::InputException& err) { rc = bridge_handle_input_errors( "Block::createSmallBlock", err.getError().toValue(), bg_record); - if (rc != SLURM_SUCCESS) { - /* This is needed because sometimes we - get a sub midplane system with not - all the hardware there. This way - we can try to create blocks on all - the hardware and the good ones will - work and the bad ones will just be - removed after everything is done - being created. - */ - if (bg_conf->sub_mp_sys) - rc = SLURM_SUCCESS; - return rc; - } + } catch (const bgsched::RuntimeException& err) { + rc = bridge_handle_runtime_errors( + "Block::createSmallBlock", + err.getError().toValue(), + bg_record); + } catch (...) { + error("Unknown Error from Block::createSmallBlock"); + rc = SLURM_ERROR; } + } else { ListIterator itr = list_iterator_create(bg_record->ba_mp_list); while ((ba_mp = (ba_mp_t *)list_next(itr))) { @@ -523,39 +538,64 @@ extern int bridge_block_create(bg_record_t *bg_record) try { block_ptr = Block::create(midplanes, pt_midplanes, conn_type); + rc = SLURM_SUCCESS; } catch (const bgsched::InputException& err) { rc = bridge_handle_input_errors( "Block::create", err.getError().toValue(), bg_record); - if (rc != SLURM_SUCCESS) { - /* This is needed because sometimes we - get a sub midplane system with not - all the hardware there. This way - we can try to create blocks on all - the hardware and the good ones will - work and the bad ones will just be - removed after everything is done - being created. - */ - if (bg_conf->sub_mp_sys) - rc = SLURM_SUCCESS; - else - assert(0); - return rc; - } + } catch (...) { + error("Unknown Error from Block::createSmallBlock"); + rc = SLURM_ERROR; } } + if (rc != SLURM_SUCCESS) { + /* This is needed because sometimes we + get a sub midplane system with not + all the hardware there. This way + we can try to create blocks on all + the hardware and the good ones will + work and the bad ones will just be + removed after everything is done + being created. + */ + if (bg_conf->sub_mp_sys) + rc = SLURM_SUCCESS; + else if (bg_record->conn_type[0] != SELECT_SMALL) + assert(0); + return rc; + } + info("block created correctly"); - block_ptr->setName(bg_record->bg_block_id); - block_ptr->setMicroLoaderImage(bg_record->mloaderimage); + try { + block_ptr->setName(bg_record->bg_block_id); + } catch (const bgsched::InputException& err) { + rc = bridge_handle_input_errors("Block::setName", + err.getError().toValue(), + bg_record); + if (rc != SLURM_SUCCESS) + return rc; + } catch (...) { + error("Unknown error from Block::setName()."); + rc = SLURM_ERROR; + } + + try { + block_ptr->setMicroLoaderImage(bg_record->mloaderimage); + } catch (const bgsched::InputException& err) { + rc = bridge_handle_input_errors("Block::MicroLoaderImage", + err.getError().toValue(), + bg_record); + if (rc != SLURM_SUCCESS) + return rc; + } catch (...) { + error("Unknown error from Block::setMicroLoaderImage()."); + rc = SLURM_ERROR; + } try { block_ptr->add(""); - // block_ptr->addUser(bg_record->bg_block_id, - // bg_record->user_name); - //info("got past add"); } catch (const bgsched::InputException& err) { rc = bridge_handle_input_errors("Block::add", err.getError().toValue(), @@ -618,7 +658,7 @@ extern int bridge_block_boot(bg_record_t *bg_record) return rc; } catch (const bgsched::InternalException& err) { rc = bridge_handle_internal_errors("Block::checkIOLinksSummary", - err.getError().toValue()); + err.getError().toValue()); if (rc != SLURM_SUCCESS) return rc; } catch (...) { diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc index 082427a0d09..06e44cef10d 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc @@ -497,9 +497,31 @@ static void *_real_time(void *no_data) } if (rc == SLURM_SUCCESS) { - rt_client_ptr->setFilter(rt_filter, &filter_id, NULL); - rt_client_ptr->requestUpdates(NULL); - rt_client_ptr->receiveMessages(NULL, NULL, &failed); + /* receiveMessages will set this to false if + all is well. Otherwise we did fail. + */ + failed = true; + try { + rt_client_ptr->setFilter(rt_filter, &filter_id, + NULL); + rt_client_ptr->requestUpdates(NULL); + rt_client_ptr->receiveMessages(NULL, NULL, + &failed); + } catch (bgsched::realtime::ClientStateException& err) { + bridge_handle_input_errors( + "RealTime Setup", + err.getError().toValue(), NULL); + } catch (bgsched::realtime::ConnectionException& err) { + bridge_handle_input_errors( + "RealTime Setup", + err.getError().toValue(), NULL); + } catch (bgsched::realtime::ProtocolException& err) { + bridge_handle_input_errors( + "RealTime Setup", + err.getError().toValue(), NULL); + } catch (...) { + error("RealTime Setup: Unknown error thrown?"); + } } else failed = true; @@ -568,53 +590,63 @@ static void _do_block_poll(void) static void _handle_midplane_update(ComputeHardware::ConstPtr bgq, ba_mp_t *ba_mp) { - Coordinates::Coordinates coords(ba_mp->coord[A], ba_mp->coord[X], - ba_mp->coord[Y], ba_mp->coord[Z]); - Midplane::ConstPtr mp_ptr = bgq->getMidplane(coords); + Midplane::ConstPtr mp_ptr = bridge_get_midplane(bgq, ba_mp); int i; Dimension dim; + if (!mp_ptr) { + info("no midplane in the system at %s", ba_mp->coord_str); + return; + } + if (mp_ptr->getState() != Hardware::Available) { _handle_bad_midplane(ba_mp->coord_str, mp_ptr->getState()); /* no reason to continue */ return; } else { - Node::ConstPtrs vec = getMidplaneNodes(mp_ptr->getLocation()); - BOOST_FOREACH(const Node::ConstPtr& cnode_ptr, vec) { - _handle_node_change(ba_mp, - cnode_ptr->getLocation(), - cnode_ptr->getState()); + Node::ConstPtrs vec = bridge_get_midplane_nodes( + mp_ptr->getLocation()); + if (!vec.empty()) { + BOOST_FOREACH(const Node::ConstPtr& cnode_ptr, vec) { + _handle_node_change(ba_mp, + cnode_ptr->getLocation(), + cnode_ptr->getState()); + } } } for (i=0; i<16; i++) { - NodeBoard::ConstPtr nodeboard = mp_ptr->getNodeBoard(i); + NodeBoard::ConstPtr nb_ptr = bridge_get_nodeboard(mp_ptr, i); /* FIXME: the Hardware::Error can/should be taken away after IBM fixes it so when a cnode is in an error state it doesn't put the nodeboard in an error state as well. */ - if ((nodeboard->getState() != Hardware::Available) - && (nodeboard->getState() != Hardware::Error)) + if (nb_ptr && (nb_ptr->getState() != Hardware::Available) + && (nb_ptr->getState() != Hardware::Error)) _handle_bad_nodeboard( - nodeboard->getLocation().substr(7,3).c_str(), - ba_mp->coord_str, nodeboard->getState()); + nb_ptr->getLocation().substr(7,3).c_str(), + ba_mp->coord_str, nb_ptr->getState()); } for (dim=Dimension::A; dim<=Dimension::D; dim++) { - Switch::ConstPtr my_switch = mp_ptr->getSwitch(dim); - if (my_switch->getState() != Hardware::Available) - _handle_bad_switch(dim, - ba_mp->coord_str, - my_switch->getState()); - else { - Cable::ConstPtr my_cable = my_switch->getCable(); - /* Dimensions of length 1 do not have a - cable. (duh). - */ - if (my_cable) - _handle_cable_change(dim, ba_mp, - my_cable->getState()); + Switch::ConstPtr switch_ptr = bridge_get_switch(mp_ptr, dim); + if (switch_ptr) { + if (switch_ptr->getState() != Hardware::Available) + _handle_bad_switch(dim, + ba_mp->coord_str, + switch_ptr->getState()); + else { + Cable::ConstPtr my_cable = + switch_ptr->getCable(); + /* Dimensions of length 1 do not have a + cable. (duh). + */ + if (my_cable) + _handle_cable_change( + dim, ba_mp, + my_cable->getState()); + } } } } @@ -624,6 +656,11 @@ static void _do_hardware_poll(int level, uint16_t *coords, { ba_mp_t *ba_mp; + if (!bgqsys) { + error("_do_hardware_poll: No ComputeHardware ptr"); + return; + } + if (!ba_main_grid || (level > SYSTEM_DIMENSIONS)) return; @@ -661,7 +698,8 @@ static void *_poll(void *no_data) /* only do every 30 seconds */ if ((curr_time - 30) >= last_ran) { uint16_t coords[SYSTEM_DIMENSIONS]; - _do_hardware_poll(0, coords, getComputeHardware()); + _do_hardware_poll(0, coords, + bridge_get_compute_hardware()); last_ran = time(NULL); } @@ -688,7 +726,7 @@ void event_handler::handleRealtimeStartedRealtimeEvent( if (blocks_are_created) _do_block_poll(); /* only do every 30 seconds */ - _do_hardware_poll(0, coords, getComputeHardware()); + _do_hardware_poll(0, coords, bridge_get_compute_hardware()); } } @@ -953,8 +991,16 @@ extern int bridge_status_fini(void) bridge_status_inited = false; #if defined HAVE_BG_FILES slurm_mutex_lock(&rt_mutex); + /* make the rt connection end. */ - rt_client_ptr->disconnect(); + try { + rt_client_ptr->disconnect(); + } catch (bgsched::realtime::InternalErrorException& err) { + bridge_handle_realtime_internal_errors( + "realtime::disconnect", err.getError().toValue()); + } catch (...) { + error("Unknown error from realtime::disconnect"); + } if (kill_job_list) { list_destroy(kill_job_list); -- GitLab