From 4461a249b0e7e8e98dfba5baeb8076719f86f8b9 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Tue, 30 Aug 2011 11:39:08 -0700
Subject: [PATCH] BGQ - Handle midplane state changes.

---
 .../select/bluegene/bl_bgq/bridge_status.cc   | 61 ++++++++++++++++++-
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
index dabdd31a53b..8b7a1a8ddc4 100644
--- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
+++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc
@@ -114,6 +114,26 @@ static pthread_t poll_thread;
 static bgsched::realtime::Client *rt_client_ptr = NULL;
 pthread_mutex_t rt_mutex = PTHREAD_MUTEX_INITIALIZER;
 
+/* Drain the midplane at mp_coords unless node_already_down() reports it. */
+static void _handle_bad_midplane(const char *mp_coords,
+				 EnumWrapper<Hardware::State> state)
+{
+	char node_name[128];
+
+	assert(mp_coords);
+	snprintf(node_name, sizeof(node_name), "%s%s",
+		 bg_conf->slurm_node_prefix, mp_coords);
+
+	if (node_already_down(node_name))
+		return;
+
+	error("Midplane %s, state went to %d, marking midplane down.",
+	      node_name, state.toValue());
+	slurm_drain_nodes(node_name,
+			  (char *)"select_bluegene: MMCS midplane not UP",
+			  slurm_get_slurm_user_id());
+}
+
 static void _handle_bad_switch(int dim, const char *mp_coords,
 			       EnumWrapper<Hardware::State> state)
 {
@@ -244,7 +264,38 @@ void event_handler::handleBlockStateChangedRealtimeEvent(
 void event_handler::handleMidplaneStateChangedRealtimeEvent(
 	const MidplaneStateChangedEventInfo& event)
 {
-//	const char *midplane = event.getMidplaneId().c_str();
+	/* Note a midplane's return to service; otherwise mark it down. */
+	Coordinates ibm_coords = event.getMidplaneCoordinates();
+	uint16_t coords[SYSTEM_DIMENSIONS];
+	ba_mp_t *ba_mp;
+	int dim;
+
+	for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++)
+		coords[dim] = ibm_coords[dim];
+
+	ba_mp = coord2ba_mp(coords);
+
+	if (!ba_mp) {
+		error("Midplane %s, state went from %d to %d, "
+		      "but is not in our system",
+		      event.getLocation().c_str(),
+		      event.getPreviousState(),
+		      event.getState());
+		/* Unknown midplane: nothing to mark, and ba_mp is NULL. */
+		return;
+	}
+
+	if (event.getState() == Hardware::Available) {
+		/* Don't do anything, wait for admin to fix things,
+		 * just note things are better. */
+		info("Midplane %s(%s), has returned to service",
+		     event.getLocation().c_str(),
+		     ba_mp->coord_str);
+		return;
+	}
+
+	/* Else mark the midplane down */
+	_handle_bad_midplane(ba_mp->coord_str, event.getState());
 
 }
 
@@ -375,6 +426,7 @@ static void *_real_time(void *no_data)
 	rt_filter.setSwitches(true);
 	rt_filter.setBlocks(true);
 
+	rt_filter.setMidplanes(true);
 	rt_filter.setCables(true);
 
 	block_statuses.insert(Block::Free);
@@ -383,7 +435,6 @@ static void *_real_time(void *no_data)
 	block_statuses.insert(Block::Terminating);
 	rt_filter.setBlockStatuses(&block_statuses);
 
- 	// rt_filter.get().setMidplanes(true);
 	rt_client_ptr->addListener(event_hand);
 
 	rc = _real_time_connect();
@@ -477,6 +528,12 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq,
 	int i;
 	Dimension dim;
 
+	if (mp_ptr->getState() != Hardware::Available) {
+		_handle_bad_midplane(ba_mp->coord_str, mp_ptr->getState());
+		/* no reason to continue */
+		return;
+	}
+
 	for (i=0; i<16; i++) {
 		NodeBoard::ConstPtr nodeboard = mp_ptr->getNodeBoard(i);
 		if (nodeboard->getState() != Hardware::Available)
-- 
GitLab