From 869150058820a0c510df5c31d7bb7a32e08020b4 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Thu, 21 Feb 2013 18:00:21 -0800
Subject: [PATCH] BGQ - sanity check for handling cnodes in error state on free
 blocks.

---
 src/plugins/select/bluegene/bg_core.c | 35 +++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c
index 880bdf224de..c146b2c6a0a 100644
--- a/src/plugins/select/bluegene/bg_core.c
+++ b/src/plugins/select/bluegene/bg_core.c
@@ -483,9 +483,40 @@ extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
 
 	rc = SLURM_SUCCESS;
 	if ((bg_record->state == BG_BLOCK_FREE)
-	    || (bg_record->state & BG_BLOCK_ERROR_FLAG))
+	    || (bg_record->state & BG_BLOCK_ERROR_FLAG)) {
+
+		if (bg_record->err_ratio
+		    && (bg_record->state == BG_BLOCK_FREE)) {
+			/* Sometime the realtime server can report
+			   software error on cnodes even though the
+			   block is free.  If this is the case we need
+			   to manually clear them.
+			*/
+			ba_mp_t *found_ba_mp;
+			ListIterator itr =
+				list_iterator_create(bg_record->ba_mp_list);
+			debug("block %s is free, but has %u cnodes in error",
+			      bg_record->bg_block_id, bg_record->cnode_err_cnt);
+			while ((found_ba_mp = list_next(itr))) {
+				if (!found_ba_mp->used)
+					continue;
+
+				if (!found_ba_mp->cnode_err_bitmap)
+					found_ba_mp->cnode_err_bitmap =
+						bit_alloc(
+							bg_conf->mp_cnode_cnt);
+
+				bit_nclear(found_ba_mp->cnode_err_bitmap, 0,
+					   bit_size(found_ba_mp->
+						    cnode_err_bitmap)-1);
+			}
+			list_iterator_destroy(itr);
+			bg_record->cnode_err_cnt = 0;
+			bg_record->err_ratio = 0;
+		}
+
 		remove_from_bg_list(bg_lists->booted, bg_record);
-	else if (count >= MAX_FREE_RETRIES) {
+	} else if (count >= MAX_FREE_RETRIES) {
 		/* Something isn't right, go mark this one in an error
 		   state. */
 		update_block_msg_t block_msg;
-- 
GitLab