diff --git a/NEWS b/NEWS index 387d4e4664e61a0b1747b7ca26c70b9542739005..35e3614caa7cf34364445a7b03a13bf61d48b65f 100644 --- a/NEWS +++ b/NEWS @@ -65,6 +65,10 @@ documents those changes that are of interest to users and admins. is always set when sending or receiving a message. -- Reset backfilled job counter only when explicitly cleared using scontrol. Patch from Alejandro Lucero Palau, BSC. + -- BLUEGENE - Fix handling of blocks when a larger block will not free and, + while it is attempting to free, underlying hardware is marked in error, + creating small blocks that overlap the freeing block. This only + applies to dynamic layout mode. * Changes in SLURM 2.4.1 ======================== diff --git a/src/plugins/select/bluegene/bg_record_functions.c b/src/plugins/select/bluegene/bg_record_functions.c index 7d35a9bcbb3bbe2fbf3cf885f9a71928ef364e64..83efc4601449bcf271dcc51e739d196548c513cd 100644 --- a/src/plugins/select/bluegene/bg_record_functions.c +++ b/src/plugins/select/bluegene/bg_record_functions.c @@ -1305,7 +1305,6 @@ extern int down_nodecard(char *mp_name, bitoff_t io_start, if (!blocks_overlap(bg_record, found_record)) continue; list_push(delete_list, found_record); - list_remove(itr); } list_iterator_destroy(itr); diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index ea596e9de6642a1d83e68ac0b3693bcbfcd73303..c1d2bec684efa80a4ab7f450abe6e65d9bae57a1 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -2365,6 +2365,8 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) bg_record = find_bg_record_in_list(bg_lists->main, block_desc_ptr->bg_block_id); if (!bg_record) { + error("update_block: block %s not found", + block_desc_ptr->bg_block_id); slurm_mutex_unlock(&block_state_mutex); return ESLURM_INVALID_BLOCK_NAME; } @@ -2448,6 +2450,8 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) if
(block_desc_ptr->state == BG_BLOCK_ERROR_FLAG) { bg_record_t *found_record = NULL; List delete_list = list_create(NULL); + bool delete_it = 0; + /* This loop shouldn't do much in regular Dynamic mode since there shouldn't be overlapped blocks. But if there is a trouble block that isn't going away and @@ -2494,11 +2498,14 @@ extern int select_p_update_block(update_block_msg_t *block_desc_ptr) found_record->bg_block_id, bg_record->bg_block_id); } + resume_block(found_record); list_push(delete_list, found_record); } list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); - free_block_list(NO_VAL, delete_list, 0, 0); + if (bg_conf->layout_mode == LAYOUT_DYNAMIC) + delete_it = 1; + free_block_list(NO_VAL, delete_list, delete_it, 0); list_destroy(delete_list); put_block_in_error_state(bg_record, reason); } else if (block_desc_ptr->state == BG_BLOCK_FREE) { diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index 3e71928d1ae545c488c0849fccaa517cef8b67e2..c81826beba67a2991b094932e78b384b595c2b6b 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -285,8 +285,7 @@ static int handle_spank_mode (int argc, char *argv[]) if (get_jobid_uid_from_env (&jobid, &uid) < 0) return error ("spank environment invalid"); - verbose ("Running spank/%s for jobid [%u] uid [%u]", - mode, jobid, uid); + debug("Running spank/%s for jobid [%u] uid [%u]", mode, jobid, uid); if (strcmp (mode, "prolog") == 0) { if (spank_job_prolog (jobid, uid) < 0)