From a02d04f11fd5edd96e665ee01c8a86c6f1137d9d Mon Sep 17 00:00:00 2001
From: Tim Wickberg <tim@schedmd.com>
Date: Thu, 25 May 2017 13:51:43 -0600
Subject: [PATCH] Prevent a race between completing jobs on a user-exclusive
 node from leaving the node owned.

Two jobs completing simultaneously leads to make_node_idle()
returning before it has a chance to decrement node_ptr->owner_job_cnt,
which can result in the node being "owned" by that user even
through no jobs are running on it.

Move the decrement block to the end at a fini label, and make sure
all return paths pass through it. While moving that add a guard
against node_ptr->owner_job_cnt underflowing.

Bug 3771.
---
 NEWS                     |  2 ++
 src/slurmctld/node_mgr.c | 29 +++++++++++++++++------------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/NEWS b/NEWS
index 9566a589e13..7eefeee0db6 100644
--- a/NEWS
+++ b/NEWS
@@ -53,6 +53,8 @@ documents those changes that are of interest to users and administrators.
  -- Fix WithSubAccounts option to not include WithDeleted unless requested.
  -- Prevent a job tested on multiple partitions from being marked
     WHOLE_NODE_USER.
+ -- Prevent a race between completing jobs on a user-exclusive node from
+    leaving the node owned.
 
 * Changes in Slurm 17.02.3
 ==========================
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index b0f8025fdee..9369e7fb323 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -3777,7 +3777,7 @@ void make_node_idle(struct node_record *node_ptr,
 				      node_ptr->name);
 			}
 			if (node_ptr->comp_job_cnt > 0)
-				return;		/* More jobs completing */
+				goto fini;	/* More jobs completing */
 		}
 	}
 
@@ -3789,23 +3789,13 @@ void make_node_idle(struct node_record *node_ptr,
 			xfree(node_ptr->mcs_label);
 		}
 	}
-	if (job_ptr &&
-	    ((job_ptr->details &&
-	      (job_ptr->details->whole_node == WHOLE_NODE_USER)) ||
-	     (job_ptr->part_ptr &&
-	      (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) {
-		if (--node_ptr->owner_job_cnt == 0) {
-			node_ptr->owner = NO_VAL;
-			xfree(node_ptr->mcs_label);
-		}
-	}
 
 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
 	if (IS_NODE_DOWN(node_ptr)) {
 		debug3("%s: %s node %s being left DOWN",
 		       __func__, jobid2str(job_ptr, jbuf,
 					   sizeof(jbuf)), node_ptr->name);
-		return;
+		goto fini;
 	}
 	bit_set(up_node_bitmap, inx);
 
@@ -3842,6 +3832,21 @@ void make_node_idle(struct node_record *node_ptr,
 			bit_set(idle_node_bitmap, inx);
 		node_ptr->last_idle = now;
 	}
+
+fini:
+	if (job_ptr &&
+	    ((job_ptr->details &&
+	      (job_ptr->details->whole_node == WHOLE_NODE_USER)) ||
+	     (job_ptr->part_ptr &&
+	      (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) {
+		if (node_ptr->owner_job_cnt == 0) {
+			error("%s: node_ptr->owner_job_cnt underflow",
+			      __func__);
+		} else if (--node_ptr->owner_job_cnt == 0) {
+			node_ptr->owner = NO_VAL;
+			xfree(node_ptr->mcs_label);
+		}
+	}
 	last_node_update = now;
 }
 
-- 
GitLab