From a02d04f11fd5edd96e665ee01c8a86c6f1137d9d Mon Sep 17 00:00:00 2001 From: Tim Wickberg <tim@schedmd.com> Date: Thu, 25 May 2017 13:51:43 -0600 Subject: [PATCH] Prevent a race between completing jobs on a user-exclusive node from leaving the node owned. Two jobs completing simultaneously leads to make_node_idle() returning before it has a chance to decrement node_ptr->owner_job_cnt, which can result in the node being "owned" by that user even through no jobs are running on it. Move the decrement block to the end at a fini label, and make sure all return paths pass through it. While moving that add a guard against node_ptr->owner_job_cnt underflowing. Bug 3771. --- NEWS | 2 ++ src/slurmctld/node_mgr.c | 29 +++++++++++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/NEWS b/NEWS index 9566a589e13..7eefeee0db6 100644 --- a/NEWS +++ b/NEWS @@ -53,6 +53,8 @@ documents those changes that are of interest to users and administrators. -- Fix WithSubAccounts option to not include WithDeleted unless requested. -- Prevent a job tested on multiple partitions from being marked WHOLE_NODE_USER. + -- Prevent a race between completing jobs on a user-exclusive node from + leaving the node owned. * Changes in Slurm 17.02.3 ========================== diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index b0f8025fdee..9369e7fb323 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -3777,7 +3777,7 @@ void make_node_idle(struct node_record *node_ptr, node_ptr->name); } if (node_ptr->comp_job_cnt > 0) - return; /* More jobs completing */ + goto fini; /* More jobs completing */ } } @@ -3789,23 +3789,13 @@ void make_node_idle(struct node_record *node_ptr, xfree(node_ptr->mcs_label); } } - if (job_ptr && - ((job_ptr->details && - (job_ptr->details->whole_node == WHOLE_NODE_USER)) || - (job_ptr->part_ptr && - (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) { - if (--node_ptr->owner_job_cnt == 0) { - node_ptr->owner = NO_VAL; - xfree(node_ptr->mcs_label); - } - } node_flags = node_ptr->node_state & NODE_STATE_FLAGS; if (IS_NODE_DOWN(node_ptr)) { debug3("%s: %s node %s being left DOWN", __func__, jobid2str(job_ptr, jbuf, sizeof(jbuf)), node_ptr->name); - return; + goto fini; } bit_set(up_node_bitmap, inx); @@ -3842,6 +3832,21 @@ void make_node_idle(struct node_record *node_ptr, bit_set(idle_node_bitmap, inx); node_ptr->last_idle = now; } + +fini: + if (job_ptr && + ((job_ptr->details && + (job_ptr->details->whole_node == WHOLE_NODE_USER)) || + (job_ptr->part_ptr && + (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) { + if (node_ptr->owner_job_cnt == 0) { + error("%s: node_ptr->owner_job_cnt underflow", + __func__); + } else if (--node_ptr->owner_job_cnt == 0) { + node_ptr->owner = NO_VAL; + xfree(node_ptr->mcs_label); + } + } last_node_update = now; } -- GitLab