From f153d3487adf6197c4aded7c6c54314a3db648ec Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Thu, 19 Mar 2015 16:15:25 -0700
Subject: [PATCH] Apply bf_min_age_reserve in main scheduling loop

If the SchedulerParameters option bf_min_age_reserve is configured, then
a newly submitted job can start immediately even if there is a
higher-priority non-runnable job which has been waiting for less time than
bf_min_age_reserve.
---
 NEWS                          |  4 ++++
 doc/man/man5/slurm.conf.5     |  5 ++++-
 src/slurmctld/job_mgr.c       | 30 +++++++++++++++++++++++++++---
 src/slurmctld/job_scheduler.c | 26 +++++++++++++++++++++++++-
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index a0c36144fb9..112e7f94d69 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,10 @@ documents those changes that are of interest to users and administrators.
 
 * Changes in Slurm 14.11.6
 ==========================
+ -- If the SchedulerParameters option bf_min_age_reserve is configured, then
+    a newly submitted job can start immediately even if there is a
+    higher-priority non-runnable job which has been waiting for less time than
+    bf_min_age_reserve.
 
 * Changes in Slurm 14.11.5
 ==========================
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index c15aa287122..cacbb2fffa0 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -2099,7 +2099,10 @@ This option applies only to \fBSchedulerType=sched/backfill\fR.
 .TP
 \fBbf_min_age_reserve=#\fR
 The backfill scheduler will not reserve resources for pending jobs until they
-have been pending for at least the specified number of seconds.
+have been runnable for at least the specified number of seconds.
+In addition, jobs waiting for less than the specified number of seconds will
+not prevent a newly submitted job from starting immediately, even if the newly
+submitted job has a lower priority.
 This can be valuable if jobs lack time limits or all time limits have the same
 value.
 The default value is zero, which will reserve resources for any pending job
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index aa4f030227d..8c993bad10b 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -125,6 +125,7 @@ List   job_list = NULL;		/* job_record list */
 time_t last_job_update;		/* time of last update to job records */
 
 /* Local variables */
+static int      bf_min_age_reserve = 0;
 static uint32_t highest_prio = 0;
 static uint32_t lowest_prio  = TOP_PRIORITY;
 static int      hash_table_size = 0;
@@ -3755,7 +3756,9 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 			struct job_record **job_pptr, char **err_msg,
 			uint16_t protocol_version)
 {
-	static int defer_sched = -1;
+	static time_t sched_update = 0;
+	static int defer_sched = 0;
+	char *sched_params, *tmp_ptr;
 	int error_code, i;
 	bool no_alloc, top_prio, test_only, too_fragmented, independent;
 	struct job_record *job_ptr;
@@ -3821,12 +3824,21 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	else
 		too_fragmented = false;
 
-	if (defer_sched == -1) {
-		char *sched_params = slurm_get_sched_params();
+	if (sched_update != slurmctld_conf.last_update) {
+		sched_update = slurmctld_conf.last_update;
+		sched_params = slurm_get_sched_params();
 		if (sched_params && strstr(sched_params, "defer"))
 			defer_sched = 1;
 		else
 			defer_sched = 0;
+		if (sched_params &&
+		    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
+			bf_min_age_reserve = atoi(tmp_ptr + 19);
+			if (bf_min_age_reserve < 0)
+				bf_min_age_reserve = 0;
+		} else {
+			bf_min_age_reserve = 0;
+		}
 		xfree(sched_params);
 	}
 	if (defer_sched == 1)
@@ -3837,6 +3849,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	else
 		top_prio = true;	/* don't bother testing,
 					 * it is not runable anyway */
+
 	if (immediate && (too_fragmented || (!top_prio) || (!independent))) {
 		job_ptr->job_state  = JOB_FAILED;
 		job_ptr->exit_code  = 1;
@@ -8536,6 +8549,8 @@ extern void sync_job_priorities(void)
 static bool _top_priority(struct job_record *job_ptr)
 {
 	struct job_details *detail_ptr = job_ptr->details;
+	time_t now = time(NULL);
+	int pend_time;
 	bool top;
 
 #ifdef HAVE_BG
@@ -8576,6 +8591,15 @@ static bool _top_priority(struct job_record *job_ptr)
 				 * indicative of job requeue */
 				continue;
 			}
+
+			if (bf_min_age_reserve) {
+				if (job_ptr2->details->begin_time == 0)
+					continue;
+				pend_time = difftime(now, job_ptr2->
+						     details->begin_time);
+				if (pend_time < bf_min_age_reserve)
+					continue;
+			}
 			if (!acct_policy_job_runnable_state(job_ptr2) ||
 			    !misc_policy_job_runnable_state(job_ptr2) ||
 			    !part_policy_job_runnable_state(job_ptr2) ||
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 6334b86dfcc..12ed59cb66c 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -761,7 +761,7 @@ extern int schedule(uint32_t job_limit)
 	ListIterator job_iterator = NULL, part_iterator = NULL;
 	List job_queue = NULL;
 	int failed_part_cnt = 0, failed_resv_cnt = 0, job_cnt = 0;
-	int error_code, i, j, part_cnt, time_limit;
+	int error_code, i, j, part_cnt, time_limit, pend_time;
 	uint32_t job_depth = 0;
 	job_queue_rec_t *job_queue_rec;
 	struct job_record *job_ptr = NULL;
@@ -785,6 +785,7 @@ extern int schedule(uint32_t job_limit)
 	static bool wiki_sched = false;
 	static bool fifo_sched = false;
 	static int sched_timeout = 0;
+	static int bf_min_age_reserve = 0;
 	static int def_job_limit = 100;
 	static int max_jobs_per_part = 0;
 	static int defer_rpc_cnt = 0;
@@ -851,6 +852,15 @@ extern int schedule(uint32_t job_limit)
 			batch_sched_delay = 3;
 		}
 
+		if (sched_params &&
+		    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
+			bf_min_age_reserve = atoi(tmp_ptr + 19);
+			if (bf_min_age_reserve < 0)
+				bf_min_age_reserve = 0;
+		} else {
+			bf_min_age_reserve = 0;
+		}
+
 		if (sched_params &&
 		    (tmp_ptr=strstr(sched_params, "build_queue_timeout=")))
 		/*                                 01234567890123456789 */
@@ -1324,6 +1334,20 @@ next_task:
 				}
 			}
 
+			if (fail_by_part && bf_min_age_reserve) {
+				/* Consider other jobs in this partition if
+				 * job has been waiting for less than
+				 * bf_min_age_reserve time */
+				if (job_ptr->details->begin_time == 0) {
+					fail_by_part = false;
+				} else {
+					pend_time = difftime(now,
+						job_ptr->details->begin_time);
+					if (pend_time < bf_min_age_reserve)
+						fail_by_part = false;
+				}
+			}
+
 			if (fail_by_part) {
 		 		/* do not schedule more jobs in this partition
 				 * or on nodes in this partition */
-- 
GitLab