From 4ce626789dbfff156254345362abb54ebda92784 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Thu, 26 May 2016 16:16:51 -0700 Subject: [PATCH] Fix for tracking a node's allocated CPUs with gang scheduling. This bug was introduced by commit 21c52d2f61e8086209d0c4d18f4700c07588ead9 which fixed a different problem tracking resources associated with suspended jobs. There are subtle differences between jobs that are suspended by a user/administrator and jobs suspended by gang scheduling which resulted in undercounting allocated CPUs when a job suspended by gang scheduling was active at the same time as a slurmctld reconfiguration request. See bugs 2353 (original bug related to commit 21c52d2f61e8086209d0c4d18f4700c07588ead9) and bug 2765. --- NEWS | 1 + src/plugins/select/cons_res/select_cons_res.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 201d7376544..a383bc966a3 100644 --- a/NEWS +++ b/NEWS @@ -41,6 +41,7 @@ documents those changes that are of interest to users and administrators. -- Make it so --mail-type=NONE didn't throw an invalid error. -- If no default account is given for a user when creating (only a list of accounts) no default account is printed, previously NULL was printed. + -- Fix for tracking a node's allocated CPUs with gang scheduling. 
* Changes in Slurm 15.08.11 =========================== diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 3c8714acad1..89aca1ab0e4 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2535,9 +2535,12 @@ extern int select_p_select_nodeinfo_set(struct job_record *job_ptr) if (IS_JOB_RUNNING(job_ptr)) rc = _add_job_to_res(job_ptr, 0); - else if (IS_JOB_SUSPENDED(job_ptr)) - rc = _add_job_to_res(job_ptr, 1); - else + else if (IS_JOB_SUSPENDED(job_ptr)) { + if (job_ptr->priority == 0) + rc = _add_job_to_res(job_ptr, 1); + else /* Gang schedule suspend */ + rc = _add_job_to_res(job_ptr, 0); + } else return SLURM_SUCCESS; gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id); @@ -2743,7 +2746,10 @@ extern int select_p_reconfigure(void) _add_job_to_res(job_ptr, 0); } else if (IS_JOB_SUSPENDED(job_ptr)) { /* add the job in a suspended state */ - _add_job_to_res(job_ptr, 1); + if (job_ptr->priority == 0) + rc = _add_job_to_res(job_ptr, 1); + else /* Gang schedule suspend */ + rc = _add_job_to_res(job_ptr, 0); } else if (_job_cleaning(job_ptr)) { cleaning_job_cnt++; run_time = (int) difftime(now, job_ptr->end_time); -- GitLab