From f6d42fdbb293ca89da609779db8d8c04a86a8d13 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Mon, 13 Feb 2017 11:27:59 -0700
Subject: [PATCH] Don't start job early

Ensure the job does not start running before the node is booted and PrologSlurmctld
    is complete.
bug 3446
---
 NEWS                           |  2 ++
 src/slurmctld/job_mgr.c        | 11 +++++------
 src/slurmctld/job_scheduler.c  | 13 ++++++++-----
 src/slurmctld/node_scheduler.c |  2 +-
 src/slurmctld/power_save.c     |  9 ++++++++-
 src/slurmctld/slurmctld.h      |  4 ++++
 6 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/NEWS b/NEWS
index 71285be629d..4ca7ca1a4c7 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,8 @@ documents those changes that are of interest to users and administrators.
  -- burst_buffer/cray - Support default pool which is not the first pool
     reported by DataWarp and log in Slurm when pools that are added or removed
     from DataWarp.
+ -- Ensure the job does not start running before the node is booted and
+    PrologSlurmctld is complete.
 
 * Changes in Slurm 16.05.9
 ==========================
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index c63cc1156a3..5ce62443236 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -7481,8 +7481,9 @@ extern void job_config_fini(struct job_record *job_ptr)
 	}
 }
 
-#ifndef HAVE_BG
-static bool _test_nodes_ready(struct job_record *job_ptr)
+/* Determine if the nodes are ready to run a job
+ * RET true if ready */
+extern bool test_job_nodes_ready(struct job_record *job_ptr)
 {
 	if (bit_overlap(job_ptr->node_bitmap, power_node_bitmap))
 		return false;
@@ -7504,7 +7505,6 @@ static bool _test_nodes_ready(struct job_record *job_ptr)
 
 	return true;
 }
-#endif
 
 /*
  * Modify a job's memory limit if allocated all memory on a node and the node
@@ -7576,7 +7576,7 @@ void job_time_limit(void)
 		if (job_ptr->details)
 			prolog = job_ptr->details->prolog_running;
 		if ((prolog == 0) && IS_JOB_CONFIGURING(job_ptr) &&
-		    _test_nodes_ready(job_ptr)) {
+		    test_job_nodes_ready(job_ptr)) {
 			info("%s: Configuration for job %u is complete",
 			      __func__, job_ptr->job_id);
 			job_config_fini(job_ptr);
@@ -12860,7 +12860,6 @@ job_alloc_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr)
 	    (prolog == 0) && job_ptr->node_bitmap &&
 	    (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
 		last_job_update = time(NULL);
-		job_ptr->job_state &= (~JOB_CONFIGURING);
 		set_job_alias_list(job_ptr);
 	}
 
@@ -13586,7 +13585,7 @@ extern int job_node_ready(uint32_t job_id, int *ready)
 	    job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
 	    job_ptr->node_bitmap &&
 	    (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
-		job_ptr->job_state &= (~JOB_CONFIGURING);
+		last_job_update = time(NULL);
 		set_job_alias_list(job_ptr);
 	}
 
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 33973e5c4ca..6ffbb8800b6 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -3933,11 +3933,14 @@ extern void prolog_running_decr(struct job_record *job_ptr)
 	    (--job_ptr->details->prolog_running > 0))
 		return;
 
-	job_ptr->job_state &= ~JOB_CONFIGURING;
-	if (job_ptr->batch_flag &&
-	    ((job_ptr->bit_flags & NODE_REBOOT) == 0) &&
-	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
-		launch_job(job_ptr);
+	if (IS_JOB_CONFIGURING(job_ptr) && test_job_nodes_ready(job_ptr)) {
+		info("%s: Configuration for job %u is complete",
+		      __func__, job_ptr->job_id);
+		job_config_fini(job_ptr);
+		if (job_ptr->batch_flag &&
+		    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
+			launch_job(job_ptr);
+		}
 	}
 }
 
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 87c2c68da86..bd622c00a4d 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -2609,8 +2609,8 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	power_g_job_start(job_ptr);
 
 	if (configuring ||
-	    bit_overlap(job_ptr->node_bitmap, power_node_bitmap) ||
 	    !bit_super_set(job_ptr->node_bitmap, avail_node_bitmap)) {
+		/* This handles nodes explicitly requesting node reboot */
 		job_ptr->job_state |= JOB_CONFIGURING;
 	}
 
diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c
index 8f5d365978e..13494b9f625 100644
--- a/src/slurmctld/power_save.c
+++ b/src/slurmctld/power_save.c
@@ -296,8 +296,14 @@ extern int power_job_reboot(struct job_record *job_ptr)
 	pid_t pid;
 
 	boot_node_bitmap = node_features_reboot(job_ptr);
-	if (boot_node_bitmap == NULL)
+	if (boot_node_bitmap == NULL) {
+		/* Powered down nodes require reboot */
+		if (bit_overlap(power_node_bitmap, job_ptr->node_bitmap)) {
+			job_ptr->job_state |= JOB_CONFIGURING;
+			job_ptr->bit_flags |= NODE_REBOOT;
+		}
 		return SLURM_SUCCESS;
+	}
 
 	i_first = bit_ffs(boot_node_bitmap);
 	if (i_first >= 0)
@@ -322,6 +328,7 @@ extern int power_job_reboot(struct job_record *job_ptr)
 
 	nodes = bitmap2node_name(boot_node_bitmap);
 	if (nodes) {
+		/* Reboot nodes to change KNL NUMA and/or MCDRAM mode */
 		job_ptr->job_state |= JOB_CONFIGURING;
 		job_ptr->wait_all_nodes = 1;
 		job_ptr->bit_flags |= NODE_REBOOT;
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 043b033d808..abd2e8d8701 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -2196,6 +2196,10 @@ extern bool test_job_array_finished(uint32_t array_job_id);
 /* Return true if ANY tasks of specific array job ID are pending */
 extern bool test_job_array_pending(uint32_t array_job_id);
 
+/* Determine if the nodes are ready to run a job
+ * RET true if ready */
+extern bool test_job_nodes_ready(struct job_record *job_ptr);
+
 /*
  * Synchronize the batch job in the system with their files.
  * All pending batch jobs must have script and environment files
-- 
GitLab