Skip to content
Snippets Groups Projects
Commit cf4f7b57 authored by Danny Auble
Browse files

BGQ - Handle the new API change to the cable check, which was made to split
Torus cables from IO cables. (Round 1)

This code sets things up to handle future changes when a cable goes down.
parent bdf9b32b
No related branches found
No related tags found
No related merge requests found
...@@ -103,8 +103,8 @@ public: ...@@ -103,8 +103,8 @@ public:
/* /*
* Handle a cable state changed real-time event. * Handle a cable state changed real-time event.
*/ */
virtual void handleCableStateChangedRealtimeEvent( virtual void handleTorusCableStateChangedRealtimeEvent(
const CableStateChangedEventInfo& event); const TorusCableStateChangedEventInfo& event);
} event_handler_t; } event_handler_t;
...@@ -207,28 +207,26 @@ static void _handle_bad_nodeboard(const char *nb_name, const char* mp_coords, ...@@ -207,28 +207,26 @@ static void _handle_bad_nodeboard(const char *nb_name, const char* mp_coords,
return; return;
} }
static void _handle_bad_cable(int dim, const char *mp_coords, static void _handle_cable_change(int dim, ba_mp_t *ba_mp,
EnumWrapper<Hardware::State> state) EnumWrapper<Hardware::State> state)
{ {
/* FIX ME: if the cable goes down you can probably still use /* FIXME: uncomment this code when the block_allocator is
the midplane. This is a place holder of sorts until the ready to handle the new BG_SWITCH_CABLE_ERROR.
real code can be done.
*/ */
char bg_down_node[128]; // if (state == Hardware::Available) {
// /* no change */
assert(mp_coords); // if (!(ba_mp->axis_switch[dim] & BG_SWITCH_CABLE_ERROR))
// return;
snprintf(bg_down_node, sizeof(bg_down_node), "%s%s", // ba_mp->axis_switch[dim] &= (~BG_SWITCH_CABLE_ERROR);
bg_conf->slurm_node_prefix, mp_coords); // info("Cable in dim '%u' on Midplane %s(%s), "
// "has returned to service",
if (!node_already_down(bg_down_node)) { // dim, ba_mp->coord_str);
error("FIXME: Cable at dim '%d' on Midplane %s, " // } else if (!(ba_mp->axis_switch[dim] & BG_SWITCH_CABLE_ERROR)) {
"state went to %d, marking midplane down.", // ba_mp->axis_switch[dim] |= BG_SWITCH_CABLE_ERROR;
dim, bg_down_node, state.toValue()); // error("Cable at dim '%d' on Midplane %s, "
slurm_drain_nodes(bg_down_node, // "state went to %d, marking midplane down.",
(char *)"select_bluegene: MMCS cable not UP", // dim, ba_mp->coord_str, state.toValue());
slurm_get_slurm_user_id()); // }
}
} }
...@@ -378,33 +376,30 @@ void event_handler::handleNodeBoardStateChangedRealtimeEvent( ...@@ -378,33 +376,30 @@ void event_handler::handleNodeBoardStateChangedRealtimeEvent(
return; return;
} }
void event_handler::handleCableStateChangedRealtimeEvent( void event_handler::handleTorusCableStateChangedRealtimeEvent(
const CableStateChangedEventInfo& event) const TorusCableStateChangedEventInfo& event)
{ {
// const char *from_mp_name = event.getFromLocation().substr(0,6).c_str(); Coordinates ibm_coords = event.getFromMidplaneCoordinates();
// const char *to_mp_name = event.getToLocation().substr(0,6).c_str(); uint16_t coords[SYSTEM_DIMENSIONS];
// ba_mp_t *ba_mp = loc2ba_mp(mp_name); int dim;
ba_mp_t *from_ba_mp;
// if (!ba_mp) {
// error("Cable in dim '%d' on Midplane %s, state "
// "went from %d to %d, but is not in our system",
// dim, mp_name,
// event.getPreviousState(),
// event.getState());
// }
// if (event.getState() == Hardware::Available) {
// /* Don't do anything, wait for admin to fix things,
// * just note things are better. */
// info("Switch in dim '%u' on Midplane %s, " for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++)
// "has returned to service", coords[dim] = ibm_coords[dim];
// dim, mp_name);
// return;
// }
// /* Else mark the midplane down */ dim = event.getDimension();
// _handle_bad_cable(dim, ba_mp->coord_str, event.getState()); from_ba_mp = coord2ba_mp(coords);
if (!from_ba_mp) {
error("Cable in dim '%d' on Midplane %s, state "
"went from %d to %d, but is not in our system",
dim, event.getFromMidplaneLocation().c_str(),
event.getPreviousState(),
event.getState());
}
/* Else mark the midplane down */
_handle_cable_change(dim, from_ba_mp, event.getState());
return; return;
} }
...@@ -445,7 +440,7 @@ static void *_real_time(void *no_data) ...@@ -445,7 +440,7 @@ static void *_real_time(void *no_data)
rt_filter.setBlocks(true); rt_filter.setBlocks(true);
rt_filter.setMidplanes(true); rt_filter.setMidplanes(true);
rt_filter.setCables(true); rt_filter.setTorusCables(true);
block_statuses.insert(Block::Free); block_statuses.insert(Block::Free);
block_statuses.insert(Block::Booting); block_statuses.insert(Block::Booting);
...@@ -566,10 +561,8 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq, ...@@ -566,10 +561,8 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq,
my_switch->getState()); my_switch->getState());
else { else {
Cable::ConstPtr my_cable = my_switch->getCable(); Cable::ConstPtr my_cable = my_switch->getCable();
if (my_cable->getState() != Hardware::Available) _handle_cable_change(dim, ba_mp,
_handle_bad_cable(dim, my_switch->getState());
ba_mp->coord_str,
my_switch->getState());
} }
} }
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment