From 523d6a77f54baf3f94c479089ed84f9264287f4c Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 28 Apr 2010 20:09:46 +0000
Subject: [PATCH] Modify srun and salloc so that after creating a resource
 allocation, they wait for all allocated nodes to power up before proceeding.
 Salloc will log the delay with the messages "Waiting for nodes to boot" and
 "Nodes are ready for use". Srun will generate the same messages only if the
 --verbose option is used.

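For reference, the heart of the change is a small polling loop added to both
salloc and srun: after the allocation is granted, the command repeatedly calls
the existing slurm_job_node_ready() API until the allocated nodes report
READY_NODE_STATE, bounding the wait by (SuspendTimeout + ResumeTimeout) * 5.
A simplified sketch of that logic follows; the helper name and the max_delay
parameter are illustrative only and not part of this patch:

#include <errno.h>		/* EAGAIN */
#include <stdint.h>
#include <unistd.h>		/* sleep() */
#include <slurm/slurm.h>	/* slurm_job_node_ready(), READY_* flags */

#define POLL_SLEEP 3		/* retry interval in seconds */

/* Illustrative sketch: return 1 once the job's nodes are powered up,
 * 0 on fatal error, job kill, or timeout. */
static int wait_nodes_ready_sketch(uint32_t job_id, int max_delay)
{
	int cur_delay, rc;

	for (cur_delay = 0; cur_delay < max_delay; cur_delay += POLL_SLEEP) {
		rc = slurm_job_node_ready(job_id);
		if (rc == READY_JOB_FATAL)
			return 0;			/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			sleep(POLL_SLEEP);		/* transient error, retry */
			continue;
		}
		if ((rc & READY_JOB_STATE) == 0)
			return 0;			/* job was killed */
		if (rc & READY_NODE_STATE)
			return 1;			/* job and nodes ready */
		sleep(POLL_SLEEP);			/* nodes still booting */
	}
	return 0;					/* timed out */
}

In the patch itself this logic lives in _wait_nodes_ready() in
src/salloc/salloc.c and src/srun/allocate.c; salloc reports progress with
info() while srun uses verbose(), so srun only prints the messages when
--verbose is given.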
---
 NEWS                                          |  5 ++
 doc/html/power_save.shtml                     | 29 +++++---
 src/api/job_info.c                            |  4 +-
 src/common/slurm_protocol_api.c               | 17 +++++
 src/common/slurm_protocol_api.h               |  5 ++
 src/plugins/select/cons_res/select_cons_res.c | 19 ++++-
 src/plugins/select/linear/select_linear.c     | 20 ++++-
 src/salloc/salloc.c                           | 71 +++++++++++++++++-
 src/slurmctld/job_mgr.c                       | 13 ++--
 src/srun/allocate.c                           | 73 +++++++++++++++++--
 10 files changed, 221 insertions(+), 35 deletions(-)

diff --git a/NEWS b/NEWS
index 419b09ae562..61e1c51417d 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,11 @@ documents those changes that are of interest to users and admins.
  -- sview - There is now a .slurm/sviewrc created when runnning sview.
     Defaults are put in there as to how sview looks when first launched.
     You can set these by Ctrl-S or Options->Set Default Settings.
+ -- Modify srun and salloc so that after creating a resource allocation, they
+    wait for all allocated nodes to power up before proceeding. Salloc will
+    log the delay with the messages "Waiting for nodes to boot" and "Nodes are
+    ready for use". Srun will generate the same messages only if the --verbose
+    option is used.
 
 * Changes in SLURM 2.2.0.pre5
 =============================
diff --git a/doc/html/power_save.shtml b/doc/html/power_save.shtml
index 0aec4b93130..69e6ff52583 100644
--- a/doc/html/power_save.shtml
+++ b/doc/html/power_save.shtml
@@ -173,7 +173,8 @@ nodes are in power save mode using messages of this sort:
 
 <p>Using these logs you can easily see the effect of SLURM's power saving
 support.
-You can also configure SLURM with programs that perform no action as <b>SuspendProgram</b> and <b>ResumeProgram</b> to assess the potential
+You can also configure SLURM with programs that perform no action as
+<b>SuspendProgram</b> and <b>ResumeProgram</b> to assess the potential
 impact of power saving mode before enabling it.</p>
 
 <h2>Use of Allocations</h2>
@@ -189,16 +190,20 @@ available).</p>
 <p>In the case of an <i>sbatch</i> command, the batch program will start
 when node zero of the allocation is ready for use and pre-processing can
 be performed as needed before using <i>srun</i> to launch job steps.
-The operation of <i>salloc</i> and <i>srun</i> follow a similar pattern
-of getting an job allocation at one time, but possibly being unable to
-launch job steps until later.
-If <i>ssh</i> or some other tools is used by <i>salloc</i> it may be
-desirable to execute "<i>srun /bin/true</i>" or some other command
-first to insure that all nodes are booted and ready for use.
-We plan to add a job and node state of <i>CONFIGURING</i> in SLURM
-version 2.1, which could be used to prevent salloc from executing
-any processes (including <i>ssh</i>) until all of the nodes are
-ready for use.</p>
+Waiting for all nodes to be booted can be accomplished by adding the
+command "<i>scontrol waitjob $SLURM_JOBID</i>" within the script or by
+adding that command to the system <i>Prolog</i> as configured in
+<i>slurm.conf</i>, which would create the delay for all jobs on the system.
+Note that the <i>scontrol waitjob</i> command was added in SLURM version 2.2.
+When using earlier versions of SLURM, one may execute "<i>srun /bin/true</i>"
+or some other command first to ensure that all nodes are booted and ready
+for use.</p>
+
+<p>In SLURM version 2.2, the <i>salloc</i> and <i>srun</i> commands that
+create a resource allocation automatically wait for the nodes to power up.
+When using earlier versions of SLURM, <i>salloc</i> will return immediately
+after a resource allocation is made and one can execute "<i>srun /bin/true</i>"
+to ensure that all nodes are booted and ready for use.</p>
 
 <h2>Fault Tolerance</h2>
 
@@ -239,6 +244,6 @@ and perform the following actions:
 <li>Boot the appropriate image for each node</li>
 </ol>
 
-<p style="text-align:center;">Last modified 6 August 2009</p>
+<p style="text-align:center;">Last modified 28 April 2010</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/src/api/job_info.c b/src/api/job_info.c
index f42621da8be..d056d9d1f83 100644
--- a/src/api/job_info.c
+++ b/src/api/job_info.c
@@ -1083,8 +1083,8 @@ extern int slurm_job_node_ready(uint32_t job_id)
 	} else if (resp.msg_type == RESPONSE_SLURM_RC) {
 		int job_rc = ((return_code_msg_t *) resp.data) ->
 				return_code;
-		if ((job_rc == ESLURM_INVALID_PARTITION_NAME)
-		||  (job_rc == ESLURM_INVALID_JOB_ID))
+		if ((job_rc == ESLURM_INVALID_PARTITION_NAME) ||
+		    (job_rc == ESLURM_INVALID_JOB_ID))
 			rc = READY_JOB_FATAL;
 		else	/* EAGAIN */
 			rc = READY_JOB_ERROR;
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 4fb4f71a6a5..7255a7e96ac 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -228,6 +228,23 @@ uint16_t slurm_get_batch_start_timeout(void)
 	return batch_start_timeout;
 }
 
+/* slurm_get_suspend_timeout
+ * RET SuspendTimeout value from slurm.conf
+ */
+uint16_t slurm_get_suspend_timeout(void)
+{
+	uint16_t suspend_timeout = 0;
+	slurm_ctl_conf_t *conf;
+
+	if (slurmdbd_conf) {
+	} else {
+		conf = slurm_conf_lock();
+		suspend_timeout = conf->suspend_timeout;
+		slurm_conf_unlock();
+	}
+	return suspend_timeout;
+}
+
 /* slurm_get_resume_timeout
  * RET ResumeTimeout value from slurm.conf
  */
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 3a0d22db8a4..22d4521773d 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -103,6 +103,11 @@ inline slurm_protocol_config_t *slurm_get_api_config(void);
  */
 uint16_t slurm_get_batch_start_timeout(void);
 
+/* slurm_get_suspend_timeout
+ * RET SuspendTimeout value from slurm.conf
+ */
+uint16_t slurm_get_suspend_timeout(void);
+
 /* slurm_get_resume_timeout
  * RET ResumeTimeout value from slurm.conf
  */
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 2b195310054..ff12c176ab4 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -1820,9 +1820,26 @@ extern int select_p_job_begin(struct job_record *job_ptr)
 	return SLURM_SUCCESS;
 }
 
+/* Determine if allocated nodes are usable (powered up) */
 extern int select_p_job_ready(struct job_record *job_ptr)
 {
-	return SLURM_SUCCESS;
+	int i, i_first, i_last;
+	struct node_record *node_ptr;
+
+	if ((job_ptr->node_bitmap == NULL) ||
+	    ((i_first = bit_ffs(job_ptr->node_bitmap)) == -1))
+		return READY_NODE_STATE;
+	i_last  = bit_fls(job_ptr->node_bitmap);
+
+	for (i=i_first; i<=i_last; i++) {
+		if (bit_test(job_ptr->node_bitmap, i) == 0)
+			continue;
+		node_ptr = node_record_table_ptr + i;
+		if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr))
+			return 0;
+	}
+
+	return READY_NODE_STATE;
 }
 
 extern int select_p_job_resized(struct job_record *job_ptr,
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index ed51ae46054..8d30ed5738c 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -2339,12 +2339,26 @@ extern int select_p_job_begin(struct job_record *job_ptr)
 	return rc;
 }
 
+/* Determine if allocated nodes are usable (powered up) */
 extern int select_p_job_ready(struct job_record *job_ptr)
 {
-	if (!IS_JOB_RUNNING(job_ptr))
-		return 0;
+	int i, i_first, i_last;
+	struct node_record *node_ptr;
+
+	if ((job_ptr->node_bitmap == NULL) ||
+	    ((i_first = bit_ffs(job_ptr->node_bitmap)) == -1))
+		return READY_NODE_STATE;
+	i_last  = bit_fls(job_ptr->node_bitmap);
+
+	for (i=i_first; i<=i_last; i++) {
+		if (bit_test(job_ptr->node_bitmap, i) == 0)
+			continue;
+		node_ptr = node_record_table_ptr + i;
+		if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr))
+			return 0;
+	}
 
-	return 1;
+	return READY_NODE_STATE;
 }
 
 extern int select_p_job_resized(struct job_record *job_ptr,
diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index c93fec1859f..8d36fb961dc 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -77,7 +77,8 @@
 #include "src/common/node_select.h"
 #endif
 
-#define MAX_RETRIES 3
+#define MAX_RETRIES	10
+#define POLL_SLEEP	3	/* retry interval in seconds  */
 
 char **command_argv;
 int command_argc;
@@ -109,10 +110,11 @@ static void _ping_handler(srun_ping_msg_t *msg);
 static void _node_fail_handler(srun_node_fail_msg_t *msg);
 
 #ifdef HAVE_BG
-#define POLL_SLEEP 3			/* retry interval in seconds  */
 static int _wait_bluegene_block_ready(
 			resource_allocation_response_msg_t *alloc);
 static int _blocks_dealloc(void);
+#else
+static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc);
 #endif
 
 #ifdef HAVE_CRAY_XT
@@ -267,6 +269,13 @@ int main(int argc, char *argv[])
 				      "boot of the block.");
 			goto relinquish;
 		}
+#else
+		if (!_wait_nodes_ready(alloc)) {
+			if (!allocation_interrupted)
+				error("Something is wrong with the "
+				      "boot of the nodes.");
+			goto relinquish;
+		}
 #endif
 #ifdef HAVE_CRAY_XT
 		if (!_claim_reservation(alloc)) {
@@ -774,7 +783,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
 
 		if (rc == READY_JOB_FATAL)
 			break;				/* fatal error */
-		if (rc == READY_JOB_ERROR)		/* error */
+		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
 			continue;			/* retry */
 		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
 			break;
@@ -824,7 +833,7 @@ static int _blocks_dealloc(void)
 	}
 
 	if (error_code) {
-		error("slurm_load_partitions: %s\n",
+		error("slurm_load_partitions: %s",
 		      slurm_strerror(slurm_get_errno()));
 		return -1;
 	}
@@ -838,6 +847,60 @@ static int _blocks_dealloc(void)
 	bg_info_ptr = new_bg_ptr;
 	return rc;
 }
+#else
+/* returns 1 if job and nodes are ready for job to begin, 0 otherwise */
+static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
+{
+	int is_ready = 0, i, rc;
+	int cur_delay = 0;
+	int suspend_time, resume_time, max_delay;
+
+	suspend_time = slurm_get_suspend_timeout();
+	resume_time  = slurm_get_resume_timeout();
+	if ((suspend_time == 0) || (resume_time == 0))
+		return 1;	/* Power save mode disabled */
+	max_delay = suspend_time + resume_time;
+	max_delay *= 5;		/* Allow for ResumeRate support */
+
+	pending_job_id = alloc->job_id;
+
+	for (i=0; (cur_delay < max_delay); i++) {
+		if (i) {
+			if (i == 1)
+				info("Waiting for nodes to boot");
+			else
+				debug("still waiting");
+			sleep(POLL_SLEEP);
+			cur_delay += POLL_SLEEP;
+		}
+
+		rc = slurm_job_node_ready(alloc->job_id);
+
+		if (rc == READY_JOB_FATAL)
+			break;				/* fatal error */
+		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
+			continue;			/* retry */
+		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
+			break;
+		if (rc & READY_NODE_STATE) {		/* job and node ready */
+			is_ready = 1;
+			break;
+		}
+		if (allocation_interrupted)
+			break;
+	}
+	if (is_ready) {
+		if (i > 0)
+			info("Nodes %s are ready for job", alloc->node_list);
+	} else if (!allocation_interrupted)
+		error("Nodes %s are still not ready", alloc->node_list);
+	else	/* allocation_interrupted or slurmctld not responding */
+		is_ready = 0;
+
+	pending_job_id = 0;
+
+	return is_ready;
+}
 #endif	/* HAVE_BG */
 
 #ifdef HAVE_CRAY_XT
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 11003807097..e2a16b392a0 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1812,18 +1812,16 @@ extern int kill_running_job_by_node_name(char *node_name)
 				}
 				job_ptr->restart_cnt++;
 				/* Since the job completion logger
-				   removes the submit we need to add it
-				   again.
-				*/
+				 * removes the submit we need to add it
+				 * again. */
 				acct_policy_add_job_submit(job_ptr);
 			} else {
 				info("Killing job_id %u on failed node %s",
 				     job_ptr->job_id, node_name);
 				srun_node_fail(job_ptr->job_id, node_name);
 				job_ptr->job_state = JOB_NODE_FAIL |
-					JOB_COMPLETING;
-				job_ptr->exit_code =
-					MAX(job_ptr->exit_code, 1);
+						     JOB_COMPLETING;
+				job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
 				job_ptr->state_reason = FAIL_DOWN_NODE;
 				xfree(job_ptr->state_desc);
 				if (suspended) {
@@ -1831,8 +1829,7 @@ extern int kill_running_job_by_node_name(char *node_name)
 						job_ptr->suspend_time;
 					job_ptr->tot_sus_time +=
 						difftime(now,
-							 job_ptr->
-							 suspend_time);
+							 job_ptr->suspend_time);
 				} else
 					job_ptr->end_time = time(NULL);
 				deallocate_nodes(job_ptr, false, suspended);
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 3d5fed6f2af..e7cb36c1909 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -73,9 +73,10 @@
 #endif
 
 
-#define MAX_ALLOC_WAIT 60	/* seconds */
-#define MIN_ALLOC_WAIT  5	/* seconds */
-#define MAX_RETRIES    10
+#define MAX_ALLOC_WAIT	60	/* seconds */
+#define MIN_ALLOC_WAIT	5	/* seconds */
+#define MAX_RETRIES	10
+#define POLL_SLEEP	3	/* retry interval in seconds  */
 
 pthread_mutex_t msg_lock = PTHREAD_MUTEX_INITIALIZER;
 pthread_cond_t msg_cond = PTHREAD_COND_INITIALIZER;
@@ -97,10 +98,11 @@ static void _signal_while_allocating(int signo);
 static void  _intr_handler(int signo);
 
 #ifdef HAVE_BG
-#define POLL_SLEEP 3			/* retry interval in seconds  */
 static int _wait_bluegene_block_ready(
 	resource_allocation_response_msg_t *alloc);
 static int _blocks_dealloc(void);
+#else
+static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc);
 #endif
 
 #ifdef HAVE_CRAY_XT
@@ -252,7 +254,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
 
 		if (rc == READY_JOB_FATAL)
 			break;				/* fatal error */
-		if (rc == READY_JOB_ERROR)		/* error */
+		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
 			continue;			/* retry */
 		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
 			break;
@@ -316,6 +318,60 @@ static int _blocks_dealloc(void)
 	bg_info_ptr = new_bg_ptr;
 	return rc;
 }
+#else
+/* returns 1 if job and nodes are ready for job to begin, 0 otherwise */
+static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
+{
+	int is_ready = 0, i, rc;
+	int cur_delay = 0;
+	int suspend_time, resume_time, max_delay;
+
+	suspend_time = slurm_get_suspend_timeout();
+	resume_time  = slurm_get_resume_timeout();
+	if ((suspend_time == 0) || (resume_time == 0))
+		return 1;	/* Power save mode disabled */
+	max_delay = suspend_time + resume_time;
+	max_delay *= 5;		/* Allow for ResumeRate support */
+
+	pending_job_id = alloc->job_id;
+
+	for (i=0; (cur_delay < max_delay); i++) {
+		if (i) {
+			if (i == 1)
+				verbose("Waiting for nodes to boot");
+			else
+				debug("still waiting");
+			sleep(POLL_SLEEP);
+			cur_delay += POLL_SLEEP;
+		}
+
+		rc = slurm_job_node_ready(alloc->job_id);
+
+		if (rc == READY_JOB_FATAL)
+			break;				/* fatal error */
+		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
+			continue;			/* retry */
+		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
+			break;
+		if (rc & READY_NODE_STATE) {		/* job and node ready */
+			is_ready = 1;
+			break;
+		}
+		if (destroy_job)
+			break;
+	}
+	if (is_ready) {
+		if (i > 0)
+			verbose("Nodes %s are ready for job", alloc->node_list);
+	} else if (!destroy_job)
+		error("Nodes %s are still not ready", alloc->node_list);
+	else	/* destroy_job or slurmctld not responding */
+		is_ready = 0;
+
+	pending_job_id = 0;
+
+	return is_ready;
+}
 #endif	/* HAVE_BG */
 
 #ifdef HAVE_CRAY_XT
@@ -408,6 +464,13 @@ allocate_nodes(void)
 				      "boot of the block.");
 			goto relinquish;
 		}
+#else
+		if (!_wait_nodes_ready(resp)) {
+			if (!destroy_job)
+				error("Something is wrong with the "
+				      "boot of the nodes.");
+			goto relinquish;
+		}
 #endif
 #ifdef HAVE_CRAY_XT
 		if (!_claim_reservation(resp)) {
-- 
GitLab