From 5fa28b7f7653ba812b877f0fc3e2b5e6f46444aa Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Wed, 26 Oct 2011 15:13:28 -0700
Subject: [PATCH] BGQ - better error checking for when API throws errors

---
 .../select/bluegene/bl_bgq/bridge_helper.cc   |  93 ++++++++-
 .../select/bluegene/bl_bgq/bridge_helper.h    |   7 +
 .../select/bluegene/bl_bgq/bridge_linker.cc   | 190 +++++++++++-------
 .../select/bluegene/bl_bgq/bridge_status.cc   | 110 +++++++---
 4 files changed, 292 insertions(+), 108 deletions(-)

diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc
index 1cbc3984a5d..09345356cb7 100644
--- a/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc
+++ b/src/plugins/select/bluegene/bl_bgq/bridge_helper.cc
@@ -487,7 +487,7 @@ extern Block::Ptrs bridge_get_blocks(BlockFilter filter)
 	Block::Ptrs vec;
 
 	try {
-		vec = getBlocks(filter, BlockSort::AnyOrder);
+		vec = getBlocks(filter);
 	} catch (const bgsched::DatabaseException& err) {
 		bridge_handle_database_errors("getBlocks",
 					      err.getError().toValue());
@@ -505,4 +505,95 @@ extern Block::Ptrs bridge_get_blocks(BlockFilter filter)
 	return vec;
 }
 
+/* Return the midplane at ba_mp's coordinates, or an empty ptr on failure. */
+extern Midplane::ConstPtr bridge_get_midplane(ComputeHardware::ConstPtr bgqsys,
+					      ba_mp_t *ba_mp)
+{
+	assert(ba_mp);
+
+	try {
+		if (bgqsys) {	/* guard: never dereference an empty ptr */
+			Coordinates coords(ba_mp->coord[0], ba_mp->coord[1],
+					   ba_mp->coord[2], ba_mp->coord[3]);
+			return bgqsys->getMidplane(coords);
+		}
+		error("bridge_get_midplane: No ComputeHardware ptr");
+	} catch (const bgsched::InputException& err) {
+		bridge_handle_input_errors("ComputeHardware::getMidplane",
+					   err.getError().toValue(), NULL);
+	} catch (...) {
+		error("Unknown error from ComputeHardware::getMidplane.");
+	}
+	return Midplane::ConstPtr();	/* empty: callers test with !mp_ptr */
+}
+
+/* Fetch the nodes of the midplane at loc; empty vector if the call throws. */
+extern Node::ConstPtrs bridge_get_midplane_nodes(const std::string& loc)
+{
+	try {
+		return getMidplaneNodes(loc);
+	} catch (const bgsched::DatabaseException& db_err) {
+		bridge_handle_database_errors(
+			"getMidplaneNodes", db_err.getError().toValue());
+	} catch (const bgsched::InputException& in_err) {
+		bridge_handle_input_errors(
+			"getMidplaneNodes", in_err.getError().toValue(), NULL);
+	} catch (const bgsched::InternalException& int_err) {
+		bridge_handle_internal_errors(
+			"getMidplaneNodes", int_err.getError().toValue());
+	} catch (...) {
+		error("Unknown error from getMidplaneNodes.");
+	}
+
+	/* Reached only after one of the handlers above has run. */
+	return Node::ConstPtrs();
+}
+
+/* Get mp_ptr's nodeboard nodeboard_num; empty ptr if it can't be fetched. */
+extern NodeBoard::ConstPtr bridge_get_nodeboard(Midplane::ConstPtr mp_ptr,
+						int nodeboard_num)
+{
+	try {
+		if (mp_ptr)	/* guard: never dereference an empty ptr */
+			return mp_ptr->getNodeBoard(nodeboard_num);
+		error("bridge_get_nodeboard: No Midplane ptr");
+	} catch (const bgsched::InputException& err) {
+		bridge_handle_input_errors("Midplane::getNodeBoard",
+					   err.getError().toValue(), NULL);
+	} catch (...) {
+		error("Unknown error from Midplane::getNodeBoard.");
+	}
+	return NodeBoard::ConstPtr();	/* empty: callers test with !nb_ptr */
+}
+
+/* Get mp_ptr's switch for dimension dim; empty ptr if it can't be fetched. */
+extern Switch::ConstPtr bridge_get_switch(Midplane::ConstPtr mp_ptr, int dim)
+{
+	try {
+		if (mp_ptr)	/* guard: never dereference an empty ptr */
+			return mp_ptr->getSwitch(dim);
+		error("bridge_get_switch: No Midplane ptr");
+	} catch (const bgsched::InputException& err) {
+		bridge_handle_input_errors("Midplane::getSwitch",
+					   err.getError().toValue(), NULL);
+	} catch (...) {
+		error("Unknown error from Midplane::getSwitch.");
+	}
+	return Switch::ConstPtr();	/* empty: callers test with !switch_ptr */
+}
+
+/* Return the system's ComputeHardware; an empty ptr if the call throws. */
+extern ComputeHardware::ConstPtr bridge_get_compute_hardware()
+{
+	try {
+		return getComputeHardware();
+	} catch (const bgsched::InternalException& int_err) {
+		bridge_handle_internal_errors(
+			"getComputeHardware", int_err.getError().toValue());
+	} catch (...) {
+		error("Unknown error from getComputeHardware");
+	}
+	/* Reached only after one of the handlers above has run. */
+	return ComputeHardware::ConstPtr();
+}
 #endif
diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_helper.h b/src/plugins/select/bluegene/bl_bgq/bridge_helper.h
index 750acce913e..0889a9733ff 100644
--- a/src/plugins/select/bluegene/bl_bgq/bridge_helper.h
+++ b/src/plugins/select/bluegene/bl_bgq/bridge_helper.h
@@ -103,6 +103,13 @@ extern const char *bridge_hardware_state_string(const int state);
 
 /* helper functions */
 extern Block::Ptrs bridge_get_blocks(BlockFilter filter);
+extern Midplane::ConstPtr bridge_get_midplane(ComputeHardware::ConstPtr bgqsys,
+					      ba_mp_t *ba_mp);
+extern Node::ConstPtrs bridge_get_midplane_nodes(const std::string& loc);
+extern NodeBoard::ConstPtr bridge_get_nodeboard(Midplane::ConstPtr mp_ptr,
+						int nodeboard_num);
+extern Switch::ConstPtr bridge_get_switch(Midplane::ConstPtr mp_ptr, int dim);
+extern ComputeHardware::ConstPtr bridge_get_compute_hardware();
 
 #endif
 
diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc
index b39d4cead4f..fa8fcaa538d 100644
--- a/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc
+++ b/src/plugins/select/bluegene/bl_bgq/bridge_linker.cc
@@ -76,6 +76,9 @@ static void _setup_ba_mp(int level, uint16_t *coords,
 	Midplane::ConstPtr mp_ptr;
 	int i;
 
+	if (!bgqsys)
+		fatal("_setup_ba_mp: No ComputeHardware ptr");
+
 	if (level > SYSTEM_DIMENSIONS)
 		return;
 
@@ -89,38 +92,19 @@ static void _setup_ba_mp(int level, uint16_t *coords,
 		return;
 	}
 
-	if (!(ba_mp = coord2ba_mp(coords)))
+	if (!(ba_mp = coord2ba_mp(coords))
+	    || !(mp_ptr = bridge_get_midplane(bgqsys, ba_mp)))
 		return;
 
-	try {
-		Coordinates::Coordinates coords(
-			ba_mp->coord[A], ba_mp->coord[X],
-			ba_mp->coord[Y], ba_mp->coord[Z]);
-		mp_ptr = bgqsys->getMidplane(coords);
-	} catch (const bgsched::InputException& err) {
-		int rc = bridge_handle_input_errors(
-			"ComputeHardware::getMidplane",
-			err.getError().toValue(), NULL);
-		if (rc != SLURM_SUCCESS)
-			return;
-	}
-
 	ba_mp->loc = xstrdup(mp_ptr->getLocation().c_str());
 
 	ba_mp->nodecard_loc =
 		(char **)xmalloc(sizeof(char *) * bg_conf->mp_nodecard_cnt);
 	for (i=0; i<bg_conf->mp_nodecard_cnt; i++) {
-		try {
-			NodeBoard::ConstPtr nodeboard = mp_ptr->getNodeBoard(i);
+		NodeBoard::ConstPtr nb_ptr = bridge_get_nodeboard(mp_ptr, i);
+		if (nb_ptr)
 			ba_mp->nodecard_loc[i] =
-				xstrdup(nodeboard->getLocation().c_str());
-		} catch (const bgsched::InputException& err) {
-			int rc = bridge_handle_input_errors(
-				"Midplane::getNodeBoard",
-				err.getError().toValue(), NULL);
-			if (rc != SLURM_SUCCESS)
-			       ;
-		}
+				xstrdup(nb_ptr->getLocation().c_str());
 	}
 }
 
@@ -167,9 +151,18 @@ static bg_record_t * _translate_object_to_block(const Block::Ptr &block_ptr)
 		bg_record->conn_type[0] = SELECT_SMALL;
 	} else {
 		for (Dimension dim=Dimension::A; dim<=Dimension::D; dim++) {
-			bg_record->conn_type[dim] =
-				block_ptr->isTorus(dim) ?
-				SELECT_TORUS : SELECT_MESH;
+			try {
+				bg_record->conn_type[dim] =
+					block_ptr->isTorus(dim) ?
+					SELECT_TORUS : SELECT_MESH;
+			} catch (const bgsched::InputException& err) {
+				bridge_handle_input_errors(
+					"Block::isTorus",
+					err.getError().toValue(),
+					NULL);
+			} catch (...) {
+				error("Unknown error from Block::isTorus.");
+			}
 		}
 		/* Set the bitmap blank here if it is a full
 		   node we don't want anything set we also
@@ -269,13 +262,24 @@ static int _block_wait_for_jobs(char *bg_block_id)
 	job_filter.setStatuses(&job_statuses);
 
 	while (1) {
-		job_vec = getJobs(job_filter);
-		if (job_vec.empty())
-			return SLURM_SUCCESS;
-
-		BOOST_FOREACH(const Job::ConstPtr& job_ptr, job_vec) {
-			debug("waiting on mmcs job %lu to finish on block %s",
-			      job_ptr->getId(), bg_block_id);
+		try {
+			job_vec = getJobs(job_filter);
+			if (job_vec.empty())
+				return SLURM_SUCCESS;
+
+			BOOST_FOREACH(const Job::ConstPtr& job_ptr, job_vec) {
+				debug("waiting on mmcs job %lu to "
+				      "finish on block %s",
+				      job_ptr->getId(), bg_block_id);
+			}
+		} catch (const bgsched::DatabaseException& err) {
+			bridge_handle_database_errors("getJobs",
+						      err.getError().toValue());
+		} catch (const bgsched::InternalException& err) {
+			bridge_handle_internal_errors("getJobs",
+						      err.getError().toValue());
+		} catch (...) {
+			error("Unknown error from getJobs.");
 		}
 		sleep(POLL_INTERVAL);
 	}
@@ -335,7 +339,15 @@ extern int bridge_init(char *properties_file)
 #ifdef HAVE_BG_FILES
 	if (!properties_file)
 		properties_file = (char *)"";
-	bgsched::init(properties_file);
+	try {
+		bgsched::init(properties_file);
+	} catch (const bgsched::InitializationException& err) {
+		bridge_handle_init_errors("bgsched::init",
+					  err.getError().toValue());
+		fatal("can't init bridge");
+	} catch (...) {
+		fatal("Unknown error from bgsched::init, can't continue");
+	}
 #endif
 	initialized = true;
 
@@ -358,9 +370,16 @@ extern int bridge_get_size(int *size)
 #ifdef HAVE_BG_FILES
 	memset(size, 0, sizeof(int) * SYSTEM_DIMENSIONS);
 
-	Coordinates bgq_size = core::getMachineSize();
-	for (int dim=0; dim< SYSTEM_DIMENSIONS; dim++)
-		size[dim] = bgq_size[dim];
+	try {
+		Coordinates bgq_size = core::getMachineSize();
+		for (int dim=0; dim< SYSTEM_DIMENSIONS; dim++)
+			size[dim] = bgq_size[dim];
+	} catch (const bgsched::DatabaseException& err) {
+		bridge_handle_database_errors("core::getMachineSize",
+					      err.getError().toValue());
+	} catch (...) {
+		error("Unknown error from core::getMachineSize");
+	}
 #endif
 
 	return SLURM_SUCCESS;
@@ -382,7 +401,7 @@ extern int bridge_setup_system()
 
 #ifdef HAVE_BG_FILES
 	uint16_t coords[SYSTEM_DIMENSIONS];
-	_setup_ba_mp(0, coords, getComputeHardware());
+	_setup_ba_mp(0, coords, bridge_get_compute_hardware());
 #endif
 
 	return SLURM_SUCCESS;
@@ -470,26 +489,22 @@ extern int bridge_block_create(bg_record_t *bg_record)
 
 		try {
 			block_ptr = Block::create(nodecards);
+			rc = SLURM_SUCCESS;
 		} catch (const bgsched::InputException& err) {
 			rc = bridge_handle_input_errors(
 				"Block::createSmallBlock",
 				err.getError().toValue(),
 				bg_record);
-			if (rc != SLURM_SUCCESS) {
-				/* This is needed because sometimes we
-				   get a sub midplane system with not
-				   all the hardware there.  This way
-				   we can try to create blocks on all
-				   the hardware and the good ones will
-				   work and the bad ones will just be
-				   removed after everything is done
-				   being created.
-				*/
-				if (bg_conf->sub_mp_sys)
-					rc = SLURM_SUCCESS;
-				return rc;
-			}
+		} catch (const bgsched::RuntimeException& err) {
+			rc = bridge_handle_runtime_errors(
+				"Block::createSmallBlock",
+				err.getError().toValue(),
+				bg_record);
+		} catch (...) {
+			error("Unknown Error from Block::createSmallBlock");
+			rc = SLURM_ERROR;
 		}
+
 	} else {
 		ListIterator itr = list_iterator_create(bg_record->ba_mp_list);
 		while ((ba_mp = (ba_mp_t *)list_next(itr))) {
@@ -523,39 +538,64 @@ extern int bridge_block_create(bg_record_t *bg_record)
 		try {
 			block_ptr = Block::create(midplanes,
 						  pt_midplanes, conn_type);
+			rc = SLURM_SUCCESS;
 		} catch (const bgsched::InputException& err) {
 			rc = bridge_handle_input_errors(
 				"Block::create",
 				err.getError().toValue(),
 				bg_record);
-			if (rc != SLURM_SUCCESS) {
-				/* This is needed because sometimes we
-				   get a sub midplane system with not
-				   all the hardware there.  This way
-				   we can try to create blocks on all
-				   the hardware and the good ones will
-				   work and the bad ones will just be
-				   removed after everything is done
-				   being created.
-				*/
-				if (bg_conf->sub_mp_sys)
-					rc = SLURM_SUCCESS;
-				else
-					assert(0);
-				return rc;
-			}
+		} catch (...) {
+			error("Unknown Error from Block::createSmallBlock");
+			rc = SLURM_ERROR;
 		}
 	}
 
+	if (rc != SLURM_SUCCESS) {
+		/* This is needed because sometimes we
+		   get a sub midplane system with not
+		   all the hardware there.  This way
+		   we can try to create blocks on all
+		   the hardware and the good ones will
+		   work and the bad ones will just be
+		   removed after everything is done
+		   being created.
+		*/
+		if (bg_conf->sub_mp_sys)
+			rc = SLURM_SUCCESS;
+		else if (bg_record->conn_type[0] != SELECT_SMALL)
+			assert(0);
+		return rc;
+	}
+
 	info("block created correctly");
-	block_ptr->setName(bg_record->bg_block_id);
-	block_ptr->setMicroLoaderImage(bg_record->mloaderimage);
+	try {
+		block_ptr->setName(bg_record->bg_block_id);
+	} catch (const bgsched::InputException& err) {
+		rc = bridge_handle_input_errors("Block::setName",
+						err.getError().toValue(),
+						bg_record);
+		if (rc != SLURM_SUCCESS)
+			return rc;
+	} catch (...) {
+                error("Unknown error from Block::setName().");
+		rc = SLURM_ERROR;
+	}
+
+	try {
+		block_ptr->setMicroLoaderImage(bg_record->mloaderimage);
+	} catch (const bgsched::InputException& err) {
+		rc = bridge_handle_input_errors("Block::MicroLoaderImage",
+						err.getError().toValue(),
+						bg_record);
+		if (rc != SLURM_SUCCESS)
+			return rc;
+	} catch (...) {
+                error("Unknown error from Block::setMicroLoaderImage().");
+		rc = SLURM_ERROR;
+	}
 
 	try {
 		block_ptr->add("");
-		// block_ptr->addUser(bg_record->bg_block_id,
-		// 		   bg_record->user_name);
-		//info("got past add");
 	} catch (const bgsched::InputException& err) {
 		rc = bridge_handle_input_errors("Block::add",
 						err.getError().toValue(),
@@ -618,7 +658,7 @@ extern int bridge_block_boot(bg_record_t *bg_record)
 			return rc;
 	} catch (const bgsched::InternalException& err) {
 		rc = bridge_handle_internal_errors("Block::checkIOLinksSummary",
-						err.getError().toValue());
+						   err.getError().toValue());
 		if (rc != SLURM_SUCCESS)
 			return rc;
 	} catch (...) {
diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
index 082427a0d09..06e44cef10d 100644
--- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
+++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
@@ -497,9 +497,31 @@ static void *_real_time(void *no_data)
 		}
 
 		if (rc == SLURM_SUCCESS) {
-			rt_client_ptr->setFilter(rt_filter, &filter_id, NULL);
-			rt_client_ptr->requestUpdates(NULL);
-			rt_client_ptr->receiveMessages(NULL, NULL, &failed);
+			/* receiveMessages will set this to false if
+			   all is well.  Otherwise we did fail.
+			*/
+			failed = true;
+			try {
+				rt_client_ptr->setFilter(rt_filter, &filter_id,
+							 NULL);
+				rt_client_ptr->requestUpdates(NULL);
+				rt_client_ptr->receiveMessages(NULL, NULL,
+							       &failed);
+			} catch (bgsched::realtime::ClientStateException& err) {
+				bridge_handle_input_errors(
+					"RealTime Setup",
+					err.getError().toValue(), NULL);
+			} catch (bgsched::realtime::ConnectionException& err) {
+				bridge_handle_input_errors(
+					"RealTime Setup",
+					err.getError().toValue(), NULL);
+			} catch (bgsched::realtime::ProtocolException& err) {
+				bridge_handle_input_errors(
+					"RealTime Setup",
+					err.getError().toValue(), NULL);
+			} catch (...) {
+				error("RealTime Setup: Unknown error thrown?");
+			}
 		} else
 			failed = true;
 
@@ -568,53 +590,63 @@ static void _do_block_poll(void)
 static void _handle_midplane_update(ComputeHardware::ConstPtr bgq,
 				    ba_mp_t *ba_mp)
 {
-	Coordinates::Coordinates coords(ba_mp->coord[A], ba_mp->coord[X],
-					ba_mp->coord[Y], ba_mp->coord[Z]);
-	Midplane::ConstPtr mp_ptr = bgq->getMidplane(coords);
+	Midplane::ConstPtr mp_ptr = bridge_get_midplane(bgq, ba_mp);
 	int i;
 	Dimension dim;
 
+	if (!mp_ptr) {	/* the helper handed back an empty ptr */
+		info("no midplane in the system at %s", ba_mp->coord_str);
+		return;
+	}
+
 	if (mp_ptr->getState() != Hardware::Available) {
 		_handle_bad_midplane(ba_mp->coord_str, mp_ptr->getState());
 		/* no reason to continue */
 		return;
 	} else {
-		Node::ConstPtrs vec = getMidplaneNodes(mp_ptr->getLocation());
-		BOOST_FOREACH(const Node::ConstPtr& cnode_ptr, vec) {
-			_handle_node_change(ba_mp,
-					    cnode_ptr->getLocation(),
-					    cnode_ptr->getState());
+		Node::ConstPtrs vec = bridge_get_midplane_nodes(
+			mp_ptr->getLocation());
+		if (!vec.empty()) {	/* e.g. empty if the lookup threw */
+			BOOST_FOREACH(const Node::ConstPtr& cnode_ptr, vec) {
+				_handle_node_change(ba_mp,
+						    cnode_ptr->getLocation(),
+						    cnode_ptr->getState());
+			}
 		}
 	}
 
 	for (i=0; i<16; i++) {
-		NodeBoard::ConstPtr nodeboard = mp_ptr->getNodeBoard(i);
+		NodeBoard::ConstPtr nb_ptr = bridge_get_nodeboard(mp_ptr, i);
 		/* FIXME: the Hardware::Error can/should be taken away after
 		   IBM fixes it so when a cnode is in an error state
 		   it doesn't put the nodeboard in an error state as
 		   well.
 		*/
-		if ((nodeboard->getState() != Hardware::Available)
-		    && (nodeboard->getState() != Hardware::Error))
+		if (nb_ptr && (nb_ptr->getState() != Hardware::Available)
+		    && (nb_ptr->getState() != Hardware::Error))
 			_handle_bad_nodeboard(
-				nodeboard->getLocation().substr(7,3).c_str(),
-				ba_mp->coord_str, nodeboard->getState());
+				nb_ptr->getLocation().substr(7,3).c_str(),
+				ba_mp->coord_str, nb_ptr->getState());
 	}
 
 	for (dim=Dimension::A; dim<=Dimension::D; dim++) {
-		Switch::ConstPtr my_switch = mp_ptr->getSwitch(dim);
-		if (my_switch->getState() != Hardware::Available)
-			_handle_bad_switch(dim,
-					   ba_mp->coord_str,
-					   my_switch->getState());
-		else {
-			Cable::ConstPtr my_cable = my_switch->getCable();
-			/* Dimensions of length 1 do not have a
-			   cable. (duh).
-			*/
-			if (my_cable)
-				_handle_cable_change(dim, ba_mp,
-						     my_cable->getState());
+		Switch::ConstPtr switch_ptr = bridge_get_switch(mp_ptr, dim);
+		if (switch_ptr) {	/* skip if no switch was returned */
+			if (switch_ptr->getState() != Hardware::Available)
+				_handle_bad_switch(dim,
+						   ba_mp->coord_str,
+						   switch_ptr->getState());
+			else {
+				Cable::ConstPtr my_cable =
+					switch_ptr->getCable();
+				/* A dimension of length 1 has no cable,
+				   so my_cable can be empty here.
+				*/
+				if (my_cable)
+					_handle_cable_change(
+						dim, ba_mp,
+						my_cable->getState());
+			}
 		}
 	}
 }
@@ -624,6 +656,11 @@ static void _do_hardware_poll(int level, uint16_t *coords,
 {
 	ba_mp_t *ba_mp;
 
+	if (!bgqsys) {
+		error("_do_hardware_poll: No ComputeHardware ptr");
+		return;
+	}
+
 	if (!ba_main_grid || (level > SYSTEM_DIMENSIONS))
 		return;
 
@@ -661,7 +698,8 @@ static void *_poll(void *no_data)
 		/* only do every 30 seconds */
 		if ((curr_time - 30) >= last_ran) {
 			uint16_t coords[SYSTEM_DIMENSIONS];
-			_do_hardware_poll(0, coords, getComputeHardware());
+			_do_hardware_poll(0, coords,
+					  bridge_get_compute_hardware());
 			last_ran = time(NULL);
 		}
 
@@ -688,7 +726,7 @@ void event_handler::handleRealtimeStartedRealtimeEvent(
 		if (blocks_are_created)
 			_do_block_poll();
 		/* only do every 30 seconds */
-		_do_hardware_poll(0, coords, getComputeHardware());
+		_do_hardware_poll(0, coords, bridge_get_compute_hardware());
 	}
 }
 
@@ -953,8 +991,16 @@ extern int bridge_status_fini(void)
 	bridge_status_inited = false;
 #if defined HAVE_BG_FILES
 	slurm_mutex_lock(&rt_mutex);
+
 	/* make the rt connection end. */
-	rt_client_ptr->disconnect();
+	try {
+		rt_client_ptr->disconnect();
+	} catch (bgsched::realtime::InternalErrorException& err) {
+		bridge_handle_realtime_internal_errors(
+			"realtime::disconnect", err.getError().toValue());
+	} catch (...) {
+		error("Unknown error from realtime::disconnect");
+	}
 
 	if (kill_job_list) {
 		list_destroy(kill_job_list);
-- 
GitLab