From 6c9d4540514d5e0d92a0cdd210a75d70e210a4ed Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Mon, 28 Sep 2015 14:09:46 -0700 Subject: [PATCH] Fix for node state when shrinking jobs When nodes have been allocated to a job and then released by the job while resizing, this patch prevents the nodes from continuing to appear allocated and unavailable to other jobs. Requires exclusive node allocation to trigger. This prevents the previously reported failure, but a proper fix will be quite complex and delayed to the next major release of Slurm (v 16.05). bug 1851 --- NEWS | 3 +++ src/plugins/select/cons_res/select_cons_res.c | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/NEWS b/NEWS index 0b3af169339..235fa9ebfd0 100644 --- a/NEWS +++ b/NEWS @@ -24,6 +24,9 @@ documents those changes that are of interest to users and administrators. -- Correct counting for job array limits, job count limit underflow possible when master cancellation of master job record. -- For pending jobs have sacct print 0 for nnodes instead of the bogus 2. + -- Fix for tracking node state when jobs that have been allocated exclusive + access to nodes (i.e. entire nodes) later relinquish some nodes. Nodes + would previously appear partly allocated and prevent use by other jobs. * Changes in Slurm 14.11.9 ========================== diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index b3fa9d99196..8fe0a9c9a06 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1295,6 +1295,17 @@ static int _rm_job_from_one_node(struct job_record *job_ptr, if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) _dump_job_res(job); + if (job->whole_node) { + /* The node_bitmap remains set for this node, but its entire + * core_bitmap will be cleared by clear_job_resources_node() + * below.
Clear whole_node flag to prevent add_job_to_cores() + * from considering all cores on all allocated nodes as being + * allocated to this job. */ + verbose("%s: Clearing flag whole_node for job %u", + __func__, job_ptr->job_id); + job->whole_node = 0; + } + /* subtract memory */ node_inx = node_ptr - node_record_table_ptr; first_bit = bit_ffs(job->node_bitmap); -- GitLab