Skip to content
Snippets Groups Projects
Commit f63a61f2 authored by Danny Auble's avatar Danny Auble
Browse files

BGQ - Added support for handling state change in cables

parent 3885748f
No related branches found
No related tags found
No related merge requests found
......@@ -581,6 +581,15 @@ extern int check_and_set_mp_list(List mps)
if (ba_switch->usage == BG_SWITCH_NONE)
continue;
else if ((ba_switch->usage == BG_SWITCH_CABLE_ERROR)
|| (ba_switch->usage ==
BG_SWITCH_CABLE_ERROR_SET)) {
error("check_and_set_mp_list: Somehow we got "
"a switch with an error set in it. "
"This should never happen.");
continue;
}
if (ba_switch->usage & curr_ba_switch->usage) {
if (ba_debug_flags & DEBUG_FLAG_BG_ALGO_DEEP)
......
......@@ -969,10 +969,22 @@ extern void ba_setup_mp(ba_mp_t *ba_mp, bool track_down_mps, bool wrap_it)
ba_mp->axis_switch[i].int_wire[j].port_tar = j;
}
#endif
if (ba_mp->axis_switch[i].usage & BG_SWITCH_CABLE_ERROR)
set_error = 1;
if (wrap_it)
ba_mp->axis_switch[i].usage = BG_SWITCH_WRAPPED;
else
ba_mp->axis_switch[i].usage = BG_SWITCH_NONE;
if (set_error) {
if (track_down_mps)
ba_mp->axis_switch[i].usage
|= BG_SWITCH_CABLE_ERROR_SET;
else
ba_mp->axis_switch[i].usage
|= BG_SWITCH_CABLE_ERROR;
}
ba_mp->alter_switch[i].usage = BG_SWITCH_NONE;
}
}
......@@ -1491,8 +1503,16 @@ extern int validate_coord(uint16_t *coord)
extern char *ba_switch_usage_str(uint16_t usage)
{
switch (usage) {
bool error_set = (usage & BG_SWITCH_CABLE_ERROR);
uint16_t local_usage = usage;
if (error_set)
local_usage &= (~BG_SWITCH_CABLE_ERROR_SET);
switch (local_usage) {
case BG_SWITCH_NONE:
if (error_set)
return "ErrorOut";
return "None";
case BG_SWITCH_WRAPPED_PASS:
return "WrappedPass";
......@@ -1501,14 +1521,20 @@ extern char *ba_switch_usage_str(uint16_t usage)
case BG_SWITCH_PASS:
return "Passthrough";
case BG_SWITCH_WRAPPED:
if (error_set)
return "Wrapped,ErrorOut";
return "Wrapped";
case (BG_SWITCH_OUT | BG_SWITCH_OUT_PASS):
return "OutLeaving";
case BG_SWITCH_OUT:
return "Out";
case (BG_SWITCH_IN | BG_SWITCH_IN_PASS):
if (error_set)
return "InComming,ErrorOut";
return "InComming";
case BG_SWITCH_IN:
if (error_set)
return "In,ErrorOut";
return "In";
default:
error("unknown switch usage %u", usage);
......
......@@ -126,6 +126,17 @@ typedef enum {
#define BG_SWITCH_PASS 0x001C /* only passthrough is used */
#define BG_SWITCH_WRAPPED_PASS 0x001F /* all ports are in use, but no torus */
#define BG_SWITCH_TORUS 0x000F /* all ports are in use in a torus */
#define BG_SWITCH_CABLE_ERROR 0x0100 /* Flag to notify that a cable is in an
 * error state.
 */
#define BG_SWITCH_CABLE_ERROR_SET 0x0104 /* If a cable goes into an error
 * state we flag the cable as being
 * in error and set OUT_PASS
 * as well.
 * Currently SLURM only really
 * cares about the out port of a
 * switch.
 */
#define BG_SWITCH_START 0x0200 /* modified from the start list */
/*
......
......@@ -210,23 +210,71 @@ static void _handle_bad_nodeboard(const char *nb_name, const char* mp_coords,
/*
 * React to a hardware state change of the cable attached to ba_mp's
 * switch in dimension dim.
 *
 * IN dim   - dimension of the switch/cable that changed; indexes
 *            ba_mp->axis_switch[] and ba_mp->next_mp[].
 * IN ba_mp - midplane the cable belongs to.
 * IN state - new hardware state reported for the cable.
 *
 * When the cable becomes Available and was flagged in error, the
 * BG_SWITCH_CABLE_ERROR_SET bits are cleared.  When the cable leaves the
 * Available state and was not yet flagged, the bits are set and every
 * multi-midplane block containing both ba_mp and the next midplane in
 * this dimension is handed to free_block_list().
 */
static void _handle_cable_change(int dim, ba_mp_t *ba_mp,
				 EnumWrapper<Hardware::State> state)
{
	if (state == Hardware::Available) {
		/* no change */
		if (!(ba_mp->axis_switch[dim].usage & BG_SWITCH_CABLE_ERROR))
			return;
		ba_mp->axis_switch[dim].usage &= (~BG_SWITCH_CABLE_ERROR_SET);
		/* dim is a signed int, so print it with %d (matches the
		 * error() call below).
		 */
		info("Cable in dim '%d' on Midplane %s, "
		     "has returned to service",
		     dim, ba_mp->coord_str);
		/* Don't resume any blocks in the error, Admins will
		   do this when they make sure it is ready.  Really
		   only matters for static blocks.  On a dynamic
		   system no block will be left around if a cable is bad.
		*/
	} else if (!(ba_mp->axis_switch[dim].usage & BG_SWITCH_CABLE_ERROR)) {
		bg_record_t *bg_record = NULL, *smallest_bg_record = NULL;
		ListIterator itr;
		List delete_list = NULL;
		/* NOTE(review): next_mp[dim] is dereferenced below without
		 * a NULL check; presumably every midplane has a neighbor
		 * in each dimension -- confirm against ba_setup_mp().
		 */
		ba_mp_t *next_ba_mp = ba_mp->next_mp[dim];
		/* Passed through to free_block_list(); set only in the
		 * dynamic layout mode.
		 */
		bool delete_it = false;

		ba_mp->axis_switch[dim].usage |= BG_SWITCH_CABLE_ERROR_SET;
		error("Cable at dim '%d' on Midplane %s, "
		      "state went to %d, marking cable down.",
		      dim, ba_mp->coord_str, state.toValue());

		/* Now handle potential overlapping blocks. */
		if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
			delete_it = true;

		slurm_mutex_lock(&block_state_mutex);
		/* No destructor: the list only borrows the records. */
		delete_list = list_create(NULL);
		itr = list_iterator_create(bg_lists->main);
		while ((bg_record = (bg_record_t *)list_next(itr))) {
			/* A single-midplane block cannot span the cable. */
			if (bg_record->mp_count == 1)
				continue;
			/* Only blocks containing both ends of the cable
			 * are considered.
			 */
			if (!bit_test(bg_record->mp_bitmap, ba_mp->index))
				continue;
			if (!bit_test(bg_record->mp_bitmap, next_ba_mp->index))
				continue;

			/* This block uses the wire so we need to take
			 * care of it.  We only need to put one block
			 * in an error, so pick the smallest one.
			 */
			if ((bg_conf->layout_mode != LAYOUT_DYNAMIC)
			    && (!smallest_bg_record
				|| (smallest_bg_record->mp_count
				    > bg_record->mp_count)))
				smallest_bg_record = bg_record;

			list_push(delete_list, bg_record);
		}
		list_iterator_destroy(itr);
		slurm_mutex_unlock(&block_state_mutex);

		free_block_list(NO_VAL, delete_list, delete_it, 0);
		list_destroy(delete_list);

		/* Only ever set when the layout is not dynamic. */
		if (smallest_bg_record) {
			char reason[200];
			snprintf(reason, sizeof(reason),
				 "Cable going from %s -> %s went into "
				 "an error state (%d).", ba_mp->coord_str,
				 next_ba_mp->coord_str, state.toValue());
			put_block_in_error_state(smallest_bg_record, reason);
		}
	}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment