From 1e9673213e231cee6a68b949b0a951574359caa9 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Mon, 22 Aug 2016 19:14:55 -0600
Subject: [PATCH] Complete delay-boot support

Remove DelayBoot configuration parameter from knl.conf and move to
  SchedulerParameters.
Set job default delay_boot value from configuration parameter.
Enforce delay-boot scheduling logic.
---
 doc/man/man1/salloc.1          | 12 +++++----
 doc/man/man1/sbatch.1          | 12 +++++----
 doc/man/man1/srun.1            | 12 +++++----
 doc/man/man5/knl.conf.5        |  8 ------
 doc/man/man5/slurm.conf.5      |  9 +++++++
 src/slurmctld/job_mgr.c        | 46 +++++++++++++++++++++-------------
 src/slurmctld/node_scheduler.c | 23 ++++++++++++++---
 src/slurmctld/slurmctld.h      | 10 ++++----
 8 files changed, 83 insertions(+), 49 deletions(-)

diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index 84cbddd7cab..559e95a6041 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -387,12 +387,14 @@ YYYY\-MM\-DD[THH:MM[:SS]]]
 
 .TP
 \fB\-\-delay\-boot\fR=<\fIminutes\fR>
-If nodes with the features requested by the job are expected to be available
-within this time period, then wait for those nodes rather than booting currently
-available nodes into the desired state.
+Do not reboot nodes in order to satisfy this job's feature specification if
+the job has been eligible to run for less than this time period.
+If the job has waited for less than the specified period, it will use only
+nodes which already have the specified features.
 The argument is in units of minutes.
-A default value may be specified using the \fBDelayBoot\fR configuration
-parameter in the knl.conf file, otherwise the default value is zero (no delay).
+A default value may be set by a system administrator using the \fBdelay_boot\fR
+option of the \fBSchedulerParameters\fR configuration parameter in the
+slurm.conf file, otherwise the default value is zero (no delay).
 
 .TP
 \fB\-d\fR, \fB\-\-dependency\fR=<\fIdependency_list\fR>
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index f7ed187471f..25cd961eaf0 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -401,12 +401,14 @@ YYYY\-MM\-DD[THH:MM[:SS]]]
 
 .TP
 \fB\-\-delay\-boot\fR=<\fIminutes\fR>
-If nodes with the features requested by the job are expected to be available
-within this time period, then wait for those nodes rather than booting currently
-available nodes into the desired state.
+Do not reboot nodes in order to satisfy this job's feature specification if
+the job has been eligible to run for less than this time period.
+If the job has waited for less than the specified period, it will use only
+nodes which already have the specified features.
 The argument is in units of minutes.
-A default value may be specified using the \fBDelayBoot\fR configuration
-parameter in the knl.conf file, otherwise the default value is zero (no delay).
+A default value may be set by a system administrator using the \fBdelay_boot\fR
+option of the \fBSchedulerParameters\fR configuration parameter in the
+slurm.conf file, otherwise the default value is zero (no delay).
 
 .TP
 \fB\-d\fR, \fB\-\-dependency\fR=<\fIdependency_list\fR>
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 579e3193370..c9bd6c3c739 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -596,12 +596,14 @@ This option applies only to job allocations.
 
 .TP
 \fB\-\-delay\-boot\fR=<\fIminutes\fR>
-If nodes with the features requested by the job are expected to be available
-within this time period, then wait for those nodes rather than booting currently
-available nodes into the desired state.
+Do not reboot nodes in order to satisfy this job's feature specification if
+the job has been eligible to run for less than this time period.
+If the job has waited for less than the specified period, it will use only
+nodes which already have the specified features.
 The argument is in units of minutes.
-A default value may be specified using the \fBDelayBoot\fR configuration
-parameter in the knl.conf file, otherwise the default value is zero (no delay).
+A default value may be set by a system administrator using the \fBdelay_boot\fR
+option of the \fBSchedulerParameters\fR configuration parameter in the
+slurm.conf file, otherwise the default value is zero (no delay).
 
 This option applies only to job allocations.
 
diff --git a/doc/man/man5/knl.conf.5 b/doc/man/man5/knl.conf.5
index 3a115a24c4d..f1abe819581 100644
--- a/doc/man/man5/knl.conf.5
+++ b/doc/man/man5/knl.conf.5
@@ -122,14 +122,6 @@ The value can include one of the possible values identified with the
 \fBAllowNUMA\fR configuration parameter above.
 The default value is "a2a".
 
-.TP
-\fBDelayBoot\fR
-If nodes with the MCDRAM and NUMA state requested by the job are expected to
-be available within this time period, then wait for those nodes rather than
-booting currently available nodes into the desired state.
-May be overridden by a job's --delay-boot option.
-The argument value is in minutes with a default value of zero (no delay).
-
 .TP
 \fBLogFile\fR
 Fully qualified path to a log file.
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 501eedf6b37..8e4ed37c345 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -2595,6 +2595,15 @@ This option may improve system responsiveness when large numbers of jobs
 (many hundreds) are submitted at the same time, but it will delay the
 initiation time of individual jobs. Also see \fBdefault_queue_depth\fR above.
 .TP
+\fBdelay_boot=#\fR
+Do not reboot nodes in order to satisfy a job's feature specification if
+the job has been eligible to run for less than this time period.
+If the job has waited for less than the specified period, it will use only
+nodes which already have the specified features.
+The argument is in units of minutes.
+Individual jobs may override this default value with the \fB\-\-delay\-boot\fR
+option.
+.TP
 \fBdisable_user_top\fB
 Disable use of the "scontrol top" command by non-privileged users.
 .TP
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 7dfe053fa44..46d00679e6a 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -130,6 +130,7 @@ time_t last_job_update;		/* time of last update to job records */
 
 /* Local variables */
 static int      bf_min_age_reserve = 0;
+static uint32_t delay_boot = 0;
 static uint32_t highest_prio = 0;
 static uint32_t lowest_prio  = TOP_PRIORITY;
 static int      hash_table_size = 0;
@@ -4114,12 +4115,35 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 {
 	static time_t sched_update = 0;
 	static int defer_sched = 0;
-	char *sched_params, *tmp_ptr;
+	char *key, *sched_params, *tmp_ptr;
 	int error_code, i;
 	bool no_alloc, top_prio, test_only, too_fragmented, independent;
 	struct job_record *job_ptr;
 	time_t now = time(NULL);
 
+	if (sched_update != slurmctld_conf.last_update) {
+		sched_update = slurmctld_conf.last_update;
+		sched_params = slurm_get_sched_params();
+		if (sched_params && strstr(sched_params, "defer"))
+			defer_sched = 1;
+		else
+			defer_sched = 0;
+		if (sched_params &&	/* may be NULL: guard strstr() */
+		    (key = strstr(sched_params, "delay_boot="))) {
+			i = time_str2secs(key + 11);	/* skip "delay_boot=" */
+			if (i != NO_VAL)
+				delay_boot = i;
+		}
+		bf_min_age_reserve = 0;
+		if (sched_params &&
+		    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
+			int min_age = atoi(tmp_ptr + 19);
+			if (min_age > 0)
+				bf_min_age_reserve = min_age;
+		}
+		xfree(sched_params);
+	}
+
 	if (job_specs->array_bitmap)
 		i = bit_set_count(job_specs->array_bitmap);
 	else
@@ -4177,22 +4201,6 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	else
 		too_fragmented = false;
 
-	if (sched_update != slurmctld_conf.last_update) {
-		sched_update = slurmctld_conf.last_update;
-		sched_params = slurm_get_sched_params();
-		if (sched_params && strstr(sched_params, "defer"))
-			defer_sched = 1;
-		else
-			defer_sched = 0;
-		bf_min_age_reserve = 0;
-		if (sched_params &&
-		    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
-			int min_age = atoi(tmp_ptr + 19);
-			if (min_age > 0)
-				bf_min_age_reserve = min_age;
-		}
-		xfree(sched_params);
-	}
 	if (defer_sched == 1)
 		too_fragmented = true;
 
@@ -7051,7 +7059,9 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
 	job_ptr->job_state  = JOB_PENDING;
 	job_ptr->time_limit = job_desc->time_limit;
 	job_ptr->deadline   = job_desc->deadline;
-	if (job_desc->delay_boot != NO_VAL)
+	if (job_desc->delay_boot == NO_VAL)
+		job_ptr->delay_boot   = delay_boot;
+	else
 		job_ptr->delay_boot   = job_desc->delay_boot;
 	if (job_desc->time_min != NO_VAL)
 		job_ptr->time_min = job_desc->time_min;
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 2678b4dfb1a..79a6a386a5d 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1068,8 +1068,8 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size,
 		job_ptr->details->req_node_bitmap = NULL;
 	}
 	saved_min_cpus = job_ptr->details->min_cpus;
-	/* Don't mess with max_cpus here since it is only set (as of
-	 * 2.2 to be a limit and not user configurable. */
+	/* Don't mess with max_cpus here since it is only set to be a limit
+	 * and not user configurable. */
 	job_ptr->details->min_cpus = 1;
 	tmp_node_set_ptr = xmalloc(sizeof(struct node_set) * node_set_size * 2);
 
@@ -2132,6 +2132,23 @@ static bool _first_array_task(struct job_record *job_ptr)
 	return false;
 }
 
+/* Return true if nodes may be rebooted now to change features for this job */
+static bool _job_reboot_test(struct job_record *job_ptr, bool test_only)
+{
+	if (!node_features_g_user_update(job_ptr->user_id))
+		return false;
+
+	if (test_only)
+		return true;
+
+	if (job_ptr->details && 
+	    ((job_ptr->details->begin_time == 0) ||
+	     ((job_ptr->details->begin_time+job_ptr->delay_boot) > time(NULL))))
+		return false;	/* no begin_time yet, or still within delay_boot */
+
+	return true;
+}
+
 /*
  * select_nodes - select and allocate nodes to a specific job
  * IN job_ptr - pointer to the job record
@@ -2237,7 +2254,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	}
 
 	/* build sets of usable nodes based upon their configuration */
-	can_reboot = node_features_g_user_update(job_ptr->user_id);
+	can_reboot = _job_reboot_test(job_ptr, test_only);
 	error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size,
 				      err_msg, test_only, can_reboot);
 	if (error_code)
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index a75c2f4a837..0bf47708919 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -582,6 +582,11 @@ struct job_record {
 	uint16_t batch_flag;		/* 1 or 2 if batch job (with script),
 					 * 2 indicates retry mode (one retry) */
 	char *batch_host;		/* host executing batch script */
+	double billable_tres;		/* calculated billable tres for the
+					 * job, as defined by the partition's
+					 * billing weight. Recalculated upon job
+					 * resize.  Cannot be calculated until
+					 * the job is allocated resources. */
 	uint32_t bit_flags;             /* various flags */
 	char *burst_buffer;		/* burst buffer specification */
 	char *burst_buffer_state;	/* burst buffer state */
@@ -594,11 +599,6 @@ struct job_record {
 					 * by the job, decremented while job is
 					 * completing (N/A for bluegene
 					 * systems) */
-	double billable_tres;		/* calculated billable tres for the
-					 * job, as defined by the partition's
-					 * billing weight. Recalculated upon job
-					 * resize.  Cannot be calculated until
-					 * the job is alloocated resources. */
 	uint16_t cr_enabled;            /* specify if Consumable Resources
 					 * is enabled. Needed since CR deals
 					 * with a finer granularity in its
-- 
GitLab