From 202f58802d19de7d8a13296ed582a921fd0a9ecc Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 17 Sep 2009 15:43:34 +0000
Subject: [PATCH] Add SchedulerParameters option of "max_job_bf=#" to control
 how far down the queue SLURM searches in an attempt to backfill jobs. The
 default value is 50 jobs.

---
 NEWS                                          |  3 +
 doc/man/man5/slurm.conf.5                     | 57 ++++++++++++-------
 src/plugins/sched/backfill/backfill.c         | 28 +++++----
 src/plugins/sched/backfill/backfill_wrapper.c |  5 +-
 4 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/NEWS b/NEWS
index 980c77afed5..50ceb9093b5 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,9 @@ documents those changes that are of interest to users and admins.
 =============================
  -- Add squeue option "--start" to report expected start time of pending jobs.
  -- Sched/backfill plugin modified to set expected start time of pending jobs.
+ -- Add SchedulerParameters option of "max_job_bf=#" to control how far down
+    the queue SLURM searches in an attempt to backfill jobs. The default
+    value is 50 jobs.
  -- Fixed cause of squeue -o "%C" seg fault
 
 * Changes in SLURM 2.1.0-pre4
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index ef6db346dac..635e51ce944 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -281,7 +281,7 @@ Sched/wiki and wiki2 communications
 Default real memory size available per allocated CPU in MegaBytes. 
 Used to avoid over\-subscribing memory and causing paging.
 \fBDefMemPerCPU\fR would generally be used if individual processors
-are alocated to jobs (\fBSelectType=select/cons_res\fR). 
+are allocated to jobs (\fBSelectType=select/cons_res\fR). 
 The default value is 0 (unlimited).
 Also see \fBDefMemPerNode\fR and \fBMaxMemPerCPU\fR.
 \fBDefMemPerCPU\fR and \fBDefMemPerNode\fR are mutually exclusive.
@@ -294,7 +294,7 @@ not be stored, just collected).
 Default real memory size available per allocated node in MegaBytes.
 Used to avoid over\-subscribing memory and causing paging.
 \fBDefMemPerNode\fR would generally be used if whole nodes
-are alocated to jobs (\fBSelectType=select/linear\fR) and 
+are allocated to jobs (\fBSelectType=select/linear\fR) and 
 resources are shared (\fBShared=yes\fR or \fBShared=force\fR).
 The default value is 0 (unlimited).
 Also see \fBDefMemPerCPU\fR and \fBMaxMemPerNode\fR.
@@ -423,7 +423,7 @@ match the actual hardware configuration if \fBSchedulerType=sched/gang\fR
 or \fBSelectType=select/cons_res\fR are configured (both of those plugins
 maintain resource allocation information using bitmaps for the cores in the
 system and must remain static, while the node's memory and disk space can
-be esblished later).
+be established later).
 .TP
 \fB2\fR
 Consider the configuration of each node to be that specified in the 
@@ -647,7 +647,7 @@ May not exceed 65533.
 Maximum real memory size available per allocated CPU in MegaBytes. 
 Used to avoid over\-subscribing memory and causing paging.
 \fBMaxMemPerCPU\fR would generally be used if individual processors
-are alocated to jobs (\fBSelectType=select/cons_res\fR).
+are allocated to jobs (\fBSelectType=select/cons_res\fR).
 The default value is 0 (unlimited).
 Also see \fBDefMemPerCPU\fR and \fBMaxMemPerNode\fR.
 \fBMaxMemPerCPU\fR and \fBMaxMemPerNode\fR are mutually exclusive.
@@ -660,7 +660,7 @@ not be stored, just collected).
 Maximum real memory size available per allocated node in MegaBytes.
 Used to avoid over\-subscribing memory and causing paging.
 \fBMaxMemPerNode\fR would generally be used if whole nodes
-are alocated to jobs (\fBSelectType=select/linear\fR) and
+are allocated to jobs (\fBSelectType=select/linear\fR) and
 resources are shared (\fBShared=yes\fR or \fBShared=force\fR).
 The default value is 0 (unlimited).
 Also see \fBDefMemPerNode\fR and \fBMaxMemPerCPU\fR.
@@ -704,15 +704,15 @@ LAM MPI and Open MPI).
 \fBMpiParams\fR
 MPI parameters. 
 Used to identify ports used by OpenMPI only and the input format is
-"ports=12000\-12999" to identify a range of communcation ports to be used.
+"ports=12000\-12999" to identify a range of communication ports to be used.
 
 .TP
 \fBOverTimeLimit\fR
 Number of minutes by which a job can exceed its time limit before 
-being cancelled. 
+being canceled. 
 The configured job time limit is treated as a \fIsoft\fR limit.
 Adding \fBOverTimeLimit\fR to the \fIsoft\fR limit provides a \fIhard\fR
-limit, at which point the job is cancelled.
+limit, at which point the job is canceled.
 This is particularly useful for backfill scheduling, which bases upon
 each job's soft time limit.
 The default value is zero.
@@ -751,20 +751,20 @@ method specification with the two options comma separated.
 \fBOFF\fR
 is the default value and disables job preemption and gang scheduling.
 This is the only option compatible with \fBSchedulerType=sched/wiki\fR 
-or \fBSchedulerType=sched/wiki2\fR (used by Maui and Moab respecitvely, 
+or \fBSchedulerType=sched/wiki2\fR (used by Maui and Moab respectively, 
 which provide their own job preemption functionality).
 .TP
 \fBCANCEL\fR
 always cancel the job.
 .TP
 \fBCHECKPOINT\fR
-preempts jobs by checkpointing them (if possible) or cancelling them.
+preempts jobs by checkpointing them (if possible) or canceling them.
 .TP
 \fBGANG\fR
 enables gang scheduling (time slicing) of jobs in the same partition.
 .TP
 \fBREQUEUE\fR
-preempts jobs by requeuing them (if possible) or cancelling them.
+preempts jobs by requeuing them (if possible) or canceling them.
 .TP
 \fBSUSPEND\fR
 preempts jobs by suspending them.
@@ -933,7 +933,7 @@ prevents regular users from viewing reservations.
 .TP
 \fBusage\fR 
 (NON-SLURMDBD ACCOUNTING ONLY) prevents users from viewing 
-usage of any other user.  This applys to sreport.
+usage of any other user.  This applies to sreport.
 .TP
 \fBusers\fR 
 (NON-SLURMDBD ACCOUNTING ONLY) prevents users from viewing 
@@ -1184,12 +1184,26 @@ would run \fBxterm\fR with the title set to the SLURM jobid.
 
 .TP
 \fBSchedulerParameters\fR
-The interprettation of this parameter varies by \fBSchedulerType\fR.
-In the case of \fBSchedulerType=sched/backfill\fR, there is one 
-optional argument of the form "interval=#", where "#" is number of
-seconds between iterations. Higher values result in less overhead 
-and responsivenss, The default value is 5 secondson BlueGene systems 
-and 10 seconds otherwise.
+The interpretation of this parameter varies by \fBSchedulerType\fR.
+Multiple options may be comma separated.
+The following options apply only to \fBSchedulerType=sched/backfill\fR.
+.RS
+.TP
+\fBinterval=#\fR
+The number of seconds between iterations. 
+Higher values result in less overhead and less responsiveness.
+The default value is 5 seconds on BlueGene systems and 10 seconds otherwise.
+.TP
+\fBmax_job_bf=#\fR
+The maximum number of jobs to attempt backfill scheduling for 
+(i.e. the queue depth).
+Higher values result in more overhead and less responsiveness.
+Until an attempt is made to backfill schedule a job, its expected
+initiation time value will not be set.
+The default value is 50.
+In the case of large clusters (more than 1000 nodes) configured with
+\fBSelectType=select/cons_res\fR, setting a smaller value may be desirable.
+.RE
 
 .TP
 \fBSchedulerPort\fR
@@ -1234,6 +1248,7 @@ priority job.
 Effectiveness of backfill scheduling is dependent upon users specifying
 job time limits, otherwise all jobs will have the same time limit and
 backfilling is impossible.
+See the documentation for the \fBSchedulerParameters\fR option above.
 .TP
 \fBsched/gang\fR
 for gang scheduler (time\-slicing of parallel jobs). This also supports
@@ -1441,7 +1456,7 @@ confirm the state of \fBslurmd\fR, the node will not be automatically set to
 a DOWN state indicating a non\-responsive \fBslurmd\fR, and some other tool 
 will take responsibility for monitoring the state of each compute node 
 and its \fBslurmd\fR daemon.
-SLURM's hiearchical communication mechanism is used to ping the \fBslurmd\fR
+SLURM's hierarchical communication mechanism is used to ping the \fBslurmd\fR
 daemons in order to minimize system noise and overhead.
 The default value is 300 seconds.
 The value may not exceed 65533 seconds.
@@ -2300,10 +2315,10 @@ User ID of the job's owner.
 User name of the job's owner.
 
 .SH "NETWORK TOPOLOGY"
-SLURM is able to optimze job allocations to minimize network contention.
+SLURM is able to optimize job allocations to minimize network contention.
 Special SLURM logic is used to optimize allocations on systems with a 
 three\-dimensional interconnect (BlueGene, Sun Constellation, etc.)
-and information about configuring those systems are availble on 
+and information about configuring those systems is available on 
 web pages available here: <https://computing.llnl.gov/linux/slurm/>.
 For a hierarchical network, SLURM needs to have detailed information 
 about how nodes are configured on the network switches.
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 9897cfef17c..223cbc3deb9 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -90,12 +90,13 @@ int backfilled_jobs = 0;
 static bool new_work      = false;
 static bool stop_backfill = false;
 static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
+static int max_backfill_job_cnt = 50;
 
 #ifndef BACKFILL_INTERVAL
 #  ifdef HAVE_BG
 #    define BACKFILL_INTERVAL	5
 #  else
-#    define BACKFILL_INTERVAL	15
+#    define BACKFILL_INTERVAL	10
 #  endif
 #endif
 
@@ -103,10 +104,6 @@ static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
  * detailed logging for the entire slurmctld daemon */
 #define __DEBUG			0
 
-/* Do not attempt to build job/resource/time record for
- * more than MAX_BACKFILL_JOB_CNT records */
-#define MAX_BACKFILL_JOB_CNT	100
-
 /* Do not build job/resource/time record for more than this 
  * far in the future, in seconds, currently one day */
 #define BACKFILL_WINDOW		(24 * 60 * 60)
@@ -317,7 +314,7 @@ extern void *backfill_agent(void *args)
 	struct timeval tv1, tv2;
 	char tv_str[20], *sched_params, *tmp_ptr;
 	time_t now;
-	int backfill_interval = 0, i, iter;
+	int backfill_interval = BACKFILL_INTERVAL, i, iter;
 	static time_t last_backfill_time = 0;
 	/* Read config, and partitions; Write jobs and nodes */
 	slurmctld_lock_t all_locks = {
@@ -325,13 +322,17 @@ extern void *backfill_agent(void *args)
 
 	sched_params = slurm_get_sched_params();
 	if (sched_params && (tmp_ptr=strstr(sched_params, "interval=")))
-		backfill_interval = atoi(tmp_ptr+9);
-	else
-		backfill_interval = BACKFILL_INTERVAL;
+		backfill_interval = atoi(tmp_ptr + 9);
 	if (backfill_interval < 1) {
 		fatal("Invalid backfill scheduler interval: %d", 
 		      backfill_interval);
 	}
+	if (sched_params && (tmp_ptr=strstr(sched_params, "max_job_bf=")))
+		max_backfill_job_cnt = atoi(tmp_ptr + 11);
+	if (max_backfill_job_cnt < 1) {
+		fatal("Invalid backfill scheduler max_job_bf: %d", 
+		      max_backfill_job_cnt);
+	}
 
 	while (!stop_backfill) {
 		iter = (BACKFILL_CHECK_SEC * 1000000) /
@@ -345,7 +346,7 @@ extern void *backfill_agent(void *args)
 			break;
 
 		now = time(NULL);
-		if (!_more_work() || job_is_completing() ||
+		if (!_more_work() || _job_is_completing() ||
 		    (difftime(now, last_backfill_time) < backfill_interval))
 			continue;
 		last_backfill_time = now;
@@ -374,7 +375,7 @@ static void _attempt_backfill(void)
 	uint32_t min_nodes, max_nodes, req_nodes;
 	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
 	time_t now = time(NULL), later_start, start_res;
-	node_space_map_t node_space[MAX_BACKFILL_JOB_CNT + 3];
+	node_space_map_t *node_space;
 	static int sched_timeout = 0;
 
 	if(!sched_timeout)
@@ -389,6 +390,8 @@ static void _attempt_backfill(void)
 
 	sort_job_queue(job_queue, job_queue_size);
 
+	node_space = xmalloc(sizeof(node_space_map_t) * 
+			     (max_backfill_job_cnt + 3));
 	node_space[0].begin_time = now;
 	node_space[0].end_time = now + BACKFILL_WINDOW;
 	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
@@ -536,7 +539,7 @@ static void _attempt_backfill(void)
 			continue;
 		}
 
-		if (node_space_recs == MAX_BACKFILL_JOB_CNT) {
+		if (node_space_recs == max_backfill_job_cnt) {
 			/* Already have too many jobs to deal with */
 			break;
 		}
@@ -564,6 +567,7 @@ static void _attempt_backfill(void)
 		if ((i = node_space[i].next) == 0)
 			break;
 	}
+	xfree(node_space);
 	xfree(job_queue);
 }
 
diff --git a/src/plugins/sched/backfill/backfill_wrapper.c b/src/plugins/sched/backfill/backfill_wrapper.c
index a51c1a5c043..43370669c08 100644
--- a/src/plugins/sched/backfill/backfill_wrapper.c
+++ b/src/plugins/sched/backfill/backfill_wrapper.c
@@ -18,7 +18,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
@@ -71,7 +71,8 @@ int init( void )
 
 	pthread_mutex_lock( &thread_flag_mutex );
 	if ( backfill_thread ) {
-		debug2( "Backfill thread already running, not starting another" );
+		debug2( "Backfill thread already running, not starting "
+			"another" );
 		pthread_mutex_unlock( &thread_flag_mutex );
 		return SLURM_ERROR;
 	}
-- 
GitLab