Skip to content
Snippets Groups Projects
Commit fbbc4bdb authored by Morris Jette's avatar Morris Jette
Browse files

Fix Cray requeue bug while steps are still cleaning

Make sure no attempt is made to schedule a requeued job until all steps are
    cleaned (Node Health Check completes for all steps on a Cray).
bug 3082
parent cb7ed937
No related branches found
No related tags found
No related merge requests found
...@@ -76,6 +76,8 @@ documents those changes that are of interest to users and administrators. ...@@ -76,6 +76,8 @@ documents those changes that are of interest to users and administrators.
done. done.
-- Testsuite - Fix test1.83 to handle gaps in node names properly. -- Testsuite - Fix test1.83 to handle gaps in node names properly.
-- BlueGene - correctly scale node counts when enforcing MaxNodes limit. -- BlueGene - correctly scale node counts when enforcing MaxNodes limit.
-- Make sure no attempt is made to schedule a requeued job until all steps are
cleaned (Node Health Check completes for all steps on a Cray).
* Changes in Slurm 16.05.4 * Changes in Slurm 16.05.4
========================== ==========================
......
...@@ -195,6 +195,31 @@ static void _job_queue_rec_del(void *x) ...@@ -195,6 +195,31 @@ static void _job_queue_rec_del(void *x)
xfree(x); xfree(x);
} }
/*
 * _is_step_cleaning - test whether some step of a job is still in a
 *	cleaning state, which can happen on a Cray if a job is requeued and
 *	the step NHC is still running after the requeued job is eligible to
 *	run again.
 * IN job_ptr - job whose step_list is scanned; pending steps
 *	(step_id == SLURM_PENDING_STEP) are ignored
 * RET the non-zero SELECT_JOBDATA_CLEANING value of the first step found
 *	cleaning, otherwise 0 (also 0 when the job has no step list)
 */
static uint16_t _is_step_cleaning(struct job_record *job_ptr)
{
	ListIterator step_iterator;
	struct step_record *step_ptr;
	uint16_t cleaning = 0;

	/* step_list may be NULL (_job_runnable_test1 tests it before use);
	 * with no steps there is nothing left to clean */
	if (!job_ptr->step_list)
		return cleaning;

	step_iterator = list_iterator_create(job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next(step_iterator))) {
		/* Only check if not a pending step */
		if (step_ptr->step_id == SLURM_PENDING_STEP)
			continue;
		select_g_select_jobinfo_get(step_ptr->select_jobinfo,
					    SELECT_JOBDATA_CLEANING,
					    &cleaning);
		/* One cleaning step is enough to hold the job back */
		if (cleaning)
			break;
	}
	list_iterator_destroy(step_iterator);

	return cleaning;
}
/* Job test for ability to run now, excludes partition specific tests */ /* Job test for ability to run now, excludes partition specific tests */
static bool _job_runnable_test1(struct job_record *job_ptr, bool sched_plugin) static bool _job_runnable_test1(struct job_record *job_ptr, bool sched_plugin)
{ {
...@@ -209,6 +234,8 @@ static bool _job_runnable_test1(struct job_record *job_ptr, bool sched_plugin) ...@@ -209,6 +234,8 @@ static bool _job_runnable_test1(struct job_record *job_ptr, bool sched_plugin)
select_g_select_jobinfo_get(job_ptr->select_jobinfo, select_g_select_jobinfo_get(job_ptr->select_jobinfo,
SELECT_JOBDATA_CLEANING, SELECT_JOBDATA_CLEANING,
&cleaning); &cleaning);
if (!cleaning)
cleaning = _is_step_cleaning(job_ptr);
if (cleaning || if (cleaning ||
(job_ptr->details && job_ptr->details->prolog_running) || (job_ptr->details && job_ptr->details->prolog_running) ||
(job_ptr->step_list && list_count(job_ptr->step_list))) { (job_ptr->step_list && list_count(job_ptr->step_list))) {
......
...@@ -1992,6 +1992,8 @@ static void _step_dealloc_lps(struct step_record *step_ptr) ...@@ -1992,6 +1992,8 @@ static void _step_dealloc_lps(struct step_record *step_ptr)
if (step_ptr->step_layout == NULL) /* batch step */ if (step_ptr->step_layout == NULL) /* batch step */
return; return;
if (job_resrcs_ptr == NULL)
return;
i_first = bit_ffs(job_resrcs_ptr->node_bitmap); i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
i_last = bit_fls(job_resrcs_ptr->node_bitmap); i_last = bit_fls(job_resrcs_ptr->node_bitmap);
if (i_first == -1) /* empty bitmap */ if (i_first == -1) /* empty bitmap */
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment