diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c index 880bdf224de575030268dc2ade21e3604c7253d8..c146b2c6a0a04bc61604c1a0c691f4415351a27e 100644 --- a/src/plugins/select/bluegene/bg_core.c +++ b/src/plugins/select/bluegene/bg_core.c @@ -483,9 +483,40 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked) rc = SLURM_SUCCESS; if ((bg_record->state == BG_BLOCK_FREE) - || (bg_record->state & BG_BLOCK_ERROR_FLAG)) + || (bg_record->state & BG_BLOCK_ERROR_FLAG)) { + + if (bg_record->err_ratio + && (bg_record->state == BG_BLOCK_FREE)) { + /* Sometime the realtime server can report + software error on cnodes even though the + block is free. If this is the case we need + to manually clear them. + */ + ba_mp_t *found_ba_mp; + ListIterator itr = + list_iterator_create(bg_record->ba_mp_list); + debug("block %s is free, but has %u cnodes in error", + bg_record->bg_block_id, bg_record->cnode_err_cnt); + while ((found_ba_mp = list_next(itr))) { + if (!found_ba_mp->used) + continue; + + if (!found_ba_mp->cnode_err_bitmap) + found_ba_mp->cnode_err_bitmap = + bit_alloc( + bg_conf->mp_cnode_cnt); + + bit_nclear(found_ba_mp->cnode_err_bitmap, 0, + bit_size(found_ba_mp-> + cnode_err_bitmap)-1); + } + list_iterator_destroy(itr); + bg_record->cnode_err_cnt = 0; + bg_record->err_ratio = 0; + } + remove_from_bg_list(bg_lists->booted, bg_record); - else if (count >= MAX_FREE_RETRIES) { + } else if (count >= MAX_FREE_RETRIES) { /* Something isn't right, go mark this one in an error state. */ update_block_msg_t block_msg;