diff --git a/NEWS b/NEWS index 9566a589e133358e88853e423f241f5717d5d534..7eefeee0db60377e30bdb727822c5cb1bc1e77a7 100644 --- a/NEWS +++ b/NEWS @@ -53,6 +53,8 @@ documents those changes that are of interest to users and administrators. -- Fix WithSubAccounts option to not include WithDeleted unless requested. -- Prevent a job tested on multiple partitions from being marked WHOLE_NODE_USER. + -- Prevent a race between completing jobs on a user-exclusive node from + leaving the node owned. * Changes in Slurm 17.02.3 ========================== diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index b0f8025fdee74b434d5f310f4cd7583ba13f6cb8..9369e7fb323eab2792b5254ca616f93323ab17b6 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -3777,7 +3777,7 @@ void make_node_idle(struct node_record *node_ptr, node_ptr->name); } if (node_ptr->comp_job_cnt > 0) - return; /* More jobs completing */ + goto fini; /* More jobs completing */ } } @@ -3789,23 +3789,13 @@ void make_node_idle(struct node_record *node_ptr, xfree(node_ptr->mcs_label); } } - if (job_ptr && - ((job_ptr->details && - (job_ptr->details->whole_node == WHOLE_NODE_USER)) || - (job_ptr->part_ptr && - (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) { - if (--node_ptr->owner_job_cnt == 0) { - node_ptr->owner = NO_VAL; - xfree(node_ptr->mcs_label); - } - } node_flags = node_ptr->node_state & NODE_STATE_FLAGS; if (IS_NODE_DOWN(node_ptr)) { debug3("%s: %s node %s being left DOWN", __func__, jobid2str(job_ptr, jbuf, sizeof(jbuf)), node_ptr->name); - return; + goto fini; } bit_set(up_node_bitmap, inx); @@ -3842,6 +3832,21 @@ void make_node_idle(struct node_record *node_ptr, bit_set(idle_node_bitmap, inx); node_ptr->last_idle = now; } + +fini: + if (job_ptr && + ((job_ptr->details && + (job_ptr->details->whole_node == WHOLE_NODE_USER)) || + (job_ptr->part_ptr && + (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) { + if (node_ptr->owner_job_cnt == 0) { + error("%s: node_ptr->owner_job_cnt underflow", + __func__); + } else if (--node_ptr->owner_job_cnt == 0) { + node_ptr->owner = NO_VAL; + xfree(node_ptr->mcs_label); + } + } last_node_update = now; }