From f710a33ea312035ef46d43b64c632e54b59b24a1 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Tue, 21 Feb 2012 16:44:09 -0800 Subject: [PATCH] BGQ - fixed deadlock on system when a midplane goes missing. --- .../select/bluegene/bl_bgq/bridge_status.cc | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc index 946eeb0490f..2a3c5844504 100644 --- a/src/plugins/select/bluegene/bl_bgq/bridge_status.cc +++ b/src/plugins/select/bluegene/bl_bgq/bridge_status.cc @@ -149,7 +149,8 @@ static void _bridge_status_disconnect() } static void _handle_bad_midplane(const char *mp_coords, - EnumWrapper<Hardware::State> state) + EnumWrapper<Hardware::State> state, + bool block_state_locked) { char bg_down_node[128]; @@ -162,15 +163,24 @@ static void _handle_bad_midplane(const char *mp_coords, error("Midplane %s, state went to '%s', marking midplane down.", bg_down_node, bridge_hardware_state_string(state.toValue())); + /* unlock mutex here since slurm_drain_nodes could produce + deadlock */ + slurm_mutex_unlock(&ba_system_mutex); + if (block_state_locked) + slurm_mutex_unlock(&block_state_mutex); slurm_drain_nodes( bg_down_node, (char *)"select_bluegene: MMCS midplane not UP", slurm_get_slurm_user_id()); + if (block_state_locked) + slurm_mutex_lock(&block_state_mutex); + slurm_mutex_lock(&ba_system_mutex); } } static void _handle_bad_switch(int dim, const char *mp_coords, - EnumWrapper<Hardware::State> state) + EnumWrapper<Hardware::State> state, + bool block_state_locked) { char bg_down_node[128]; @@ -184,9 +194,17 @@ static void _handle_bad_switch(int dim, const char *mp_coords, "marking midplane down.", dim, bg_down_node, bridge_hardware_state_string(state.toValue())); + /* unlock mutex here since slurm_drain_nodes could produce + deadlock */ + slurm_mutex_unlock(&ba_system_mutex); + if (block_state_locked) + slurm_mutex_unlock(&block_state_mutex); slurm_drain_nodes(bg_down_node, (char *)"select_bluegene: MMCS switch not UP", slurm_get_slurm_user_id()); + if (block_state_locked) + slurm_mutex_lock(&block_state_mutex); + slurm_mutex_lock(&ba_system_mutex); } } @@ -715,7 +733,7 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq, } if (mp_ptr->getState() != Hardware::Available) { - _handle_bad_midplane(ba_mp->coord_str, mp_ptr->getState()); + _handle_bad_midplane(ba_mp->coord_str, mp_ptr->getState(), 1); /* no reason to continue */ return; } else { @@ -753,7 +771,7 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq, if (switch_ptr->getState() != Hardware::Available) _handle_bad_switch(dim, ba_mp->coord_str, - switch_ptr->getState()); + switch_ptr->getState(), 1); else { Cable::ConstPtr my_cable = switch_ptr->getCable(); @@ -945,7 +963,7 @@ void event_handler::handleMidplaneStateChangedRealtimeEvent( } /* Else mark the midplane down */ - _handle_bad_midplane(ba_mp->coord_str, event.getState()); + _handle_bad_midplane(ba_mp->coord_str, event.getState(), 0); slurm_mutex_unlock(&ba_system_mutex); return; @@ -990,7 +1008,7 @@ void event_handler::handleSwitchStateChangedRealtimeEvent( } /* Else mark the midplane down */ - _handle_bad_switch(dim, ba_mp->coord_str, event.getState()); + _handle_bad_switch(dim, ba_mp->coord_str, event.getState(), 0); slurm_mutex_unlock(&ba_system_mutex); return; -- GitLab