From 039faf125989b838b7a5056767832a3919ee55a4 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Tue, 30 Aug 2011 11:38:05 -0700
Subject: [PATCH] BGQ - First cut at cable state change.  The API is changing
 here hence the commented out code, and the FIXME.

---
 .../select/bluegene/bl_bgq/bridge_status.cc   | 78 +++++++++++++++++--
 1 file changed, 71 insertions(+), 7 deletions(-)

diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
index 0511dd73fc7..dabdd31a53b 100644
--- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
+++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
@@ -100,11 +100,11 @@ public:
 	virtual void handleNodeBoardStateChangedRealtimeEvent(
 		const NodeBoardStateChangedEventInfo& event);
 
-	// /*
-	//  * Handle a cable state changed real-time event.
-	//  */
-	// virtual void handleCableStateChangedRealtimeEvent(
-	// 	const CableStateChangedEventInfo& event);
+	/*
+	 * Handle a cable state changed real-time event.
+	 */
+	virtual void handleCableStateChangedRealtimeEvent(
+		const CableStateChangedEventInfo& event);
 
 } event_handler_t;
 
@@ -187,6 +187,31 @@ static void _handle_bad_nodeboard(const char *nb_name, const char* mp_coords,
 	return;
 }
 
+static void _handle_bad_cable(int dim, const char *mp_coords,
+			       EnumWrapper<Hardware::State> state)
+{
+	/* FIX ME: if the cable goes down you can probably still use
+	   the midplane.  This is a place holder of sorts until the
+	   real code can be done.
+	*/
+	char bg_down_node[128];
+
+	assert(mp_coords);
+
+	snprintf(bg_down_node, sizeof(bg_down_node), "%s%s",
+		 bg_conf->slurm_node_prefix, mp_coords);
+
+	if (!node_already_down(bg_down_node)) {
+		error("FIXME: Cable at dim '%d' on Midplane %s, "
+		      "state went to %d, marking midplane down.",
+		      dim, bg_down_node, state.toValue());
+		slurm_drain_nodes(bg_down_node,
+				  (char *)"select_bluegene: MMCS cable not UP",
+				  slurm_get_slurm_user_id());
+	}
+}
+
+
 void event_handler::handleBlockStateChangedRealtimeEvent(
         const BlockStateChangedEventInfo& event)
 {
@@ -284,6 +309,38 @@ void event_handler::handleNodeBoardStateChangedRealtimeEvent(
 	return;
 }
 
+void event_handler::handleCableStateChangedRealtimeEvent(
+	const CableStateChangedEventInfo& event)
+{
+	// const char *from_mp_name = event.getFromLocation().substr(0,6).c_str();
+	// const char *to_mp_name = event.getToLocation().substr(0,6).c_str();
+	// ba_mp_t *ba_mp = loc2ba_mp(mp_name);
+
+	// if (!ba_mp) {
+	// 	error("Cable  in dim '%d' on Midplane %s, state "
+	// 	      "went from %d to %d, but is not in our system",
+	// 	      dim, mp_name,
+	// 	      event.getPreviousState(),
+	// 	      event.getState());
+	// }
+
+	// if (event.getState() == Hardware::Available) {
+	// 	/* Don't do anything, wait for admin to fix things,
+	// 	 * just note things are better. */
+
+	// 	info("Switch in dim '%u' on Midplane %s, "
+	// 	     "has returned to service",
+	// 	     dim, mp_name);
+	// 	return;
+	// }
+
+	// /* Else mark the midplane down */
+	// _handle_bad_cable(dim, ba_mp->coord_str, event.getState());
+
+	return;
+}
+
+
 static int _real_time_connect(void)
 {
 	int rc = SLURM_ERROR;
@@ -318,6 +375,8 @@ static void *_real_time(void *no_data)
 	rt_filter.setSwitches(true);
 	rt_filter.setBlocks(true);
 
+	rt_filter.setCables(true);
+
 	block_statuses.insert(Block::Free);
 	block_statuses.insert(Block::Booting);
 	block_statuses.insert(Block::Initialized);
@@ -325,8 +384,6 @@ static void *_real_time(void *no_data)
 	rt_filter.setBlockStatuses(&block_statuses);
 
  	// rt_filter.get().setMidplanes(true);
- 	// rt_filter.get().setCables(true);
-
 	rt_client_ptr->addListener(event_hand);
 
 	rc = _real_time_connect();
@@ -434,6 +491,13 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq,
 			_handle_bad_switch(dim,
 					   ba_mp->coord_str,
 					   my_switch->getState());
+		else {
+			Cable::ConstPtr my_cable = my_switch->getCable();
+			if (my_cable->getState() != Hardware::Available)
+				_handle_bad_cable(dim,
+						  ba_mp->coord_str,
+						  my_switch->getState());
+		}
 	}
 }
 
-- 
GitLab