From 869150058820a0c510df5c31d7bb7a32e08020b4 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Thu, 21 Feb 2013 18:00:21 -0800 Subject: [PATCH] BGQ - sanity check for handling cnodes in error state on free blocks. --- src/plugins/select/bluegene/bg_core.c | 35 +++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c index 880bdf224de..c146b2c6a0a 100644 --- a/src/plugins/select/bluegene/bg_core.c +++ b/src/plugins/select/bluegene/bg_core.c @@ -483,9 +483,40 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) rc = SLURM_SUCCESS; if ((bg_record->state == BG_BLOCK_FREE) - || (bg_record->state & BG_BLOCK_ERROR_FLAG)) + || (bg_record->state & BG_BLOCK_ERROR_FLAG)) { + + if (bg_record->err_ratio + && (bg_record->state == BG_BLOCK_FREE)) { + /* Sometimes the realtime server can report + software errors on cnodes even though the + block is free. If this is the case we need + to manually clear them. + */ + ba_mp_t *found_ba_mp; + ListIterator itr = + list_iterator_create(bg_record->ba_mp_list); + debug("block %s is free, but has %u cnodes in error", + bg_record->bg_block_id, bg_record->cnode_err_cnt); + while ((found_ba_mp = list_next(itr))) { + if (!found_ba_mp->used) + continue; + + if (!found_ba_mp->cnode_err_bitmap) + found_ba_mp->cnode_err_bitmap = + bit_alloc( + bg_conf->mp_cnode_cnt); + + bit_nclear(found_ba_mp->cnode_err_bitmap, 0, + bit_size(found_ba_mp-> + cnode_err_bitmap)-1); + } + list_iterator_destroy(itr); + bg_record->cnode_err_cnt = 0; + bg_record->err_ratio = 0; + } + remove_from_bg_list(bg_lists->booted, bg_record); - else if (count >= MAX_FREE_RETRIES) { + } else if (count >= MAX_FREE_RETRIES) { /* Something isn't right, go mark this one in an error state. */ update_block_msg_t block_msg; -- GitLab