Skip to content
Snippets Groups Projects
Commit 86915005 authored by Danny Auble's avatar Danny Auble
Browse files

BGQ - sanity check for handling cnodes in error state on free blocks.

parent 98fd8b1e
No related branches found
No related tags found
No related merge requests found
......@@ -483,9 +483,40 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
rc = SLURM_SUCCESS;
if ((bg_record->state == BG_BLOCK_FREE)
|| (bg_record->state & BG_BLOCK_ERROR_FLAG))
|| (bg_record->state & BG_BLOCK_ERROR_FLAG)) {
if (bg_record->err_ratio
&& (bg_record->state == BG_BLOCK_FREE)) {
/* Sometime the realtime server can report
software error on cnodes even though the
block is free. If this is the case we need
to manually clear them.
*/
ba_mp_t *found_ba_mp;
ListIterator itr =
list_iterator_create(bg_record->ba_mp_list);
debug("block %s is free, but has %u cnodes in error",
bg_record->bg_block_id, bg_record->cnode_err_cnt);
while ((found_ba_mp = list_next(itr))) {
if (!found_ba_mp->used)
continue;
if (!found_ba_mp->cnode_err_bitmap)
found_ba_mp->cnode_err_bitmap =
bit_alloc(
bg_conf->mp_cnode_cnt);
bit_nclear(found_ba_mp->cnode_err_bitmap, 0,
bit_size(found_ba_mp->
cnode_err_bitmap)-1);
}
list_iterator_destroy(itr);
bg_record->cnode_err_cnt = 0;
bg_record->err_ratio = 0;
}
remove_from_bg_list(bg_lists->booted, bg_record);
else if (count >= MAX_FREE_RETRIES) {
} else if (count >= MAX_FREE_RETRIES) {
/* Something isn't right, go mark this one in an error
state. */
update_block_msg_t block_msg;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment