diff --git a/NEWS b/NEWS index 980c77afed575136d540de44cbe9be90902353a4..50ceb9093b5e72894d4c3ae27a63601e9c2a5d98 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,9 @@ documents those changes that are of interest to users and admins. ============================= -- Add squeue option "--start" to report expected start time of pending jobs. -- Sched/backfill plugin modified to set expected start time of pending jobs. + -- Add SchedulerParameters option of "max_job_bf=#" to control how far down + the queue SLURM searches in an attempt to backfill jobs, default value + is 50 jobs. -- Fixed cause of squeue -o "%C" seg fault * Changes in SLURM 2.1.0-pre4 diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index ef6db346dacb372a9c4cf10af902c9f3b5724e56..635e51ce94411be27b0a952d72e8e75ab23d0194 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -281,7 +281,7 @@ Sched/wiki and wiki2 communications Default real memory size available per allocated CPU in MegaBytes. Used to avoid over\-subscribing memory and causing paging. \fBDefMemPerCPU\fR would generally be used if individual processors -are alocated to jobs (\fBSelectType=select/cons_res\fR). +are allocated to jobs (\fBSelectType=select/cons_res\fR). The default value is 0 (unlimited). Also see \fBDefMemPerNode\fR and \fBMaxMemPerCPU\fR. \fBDefMemPerCPU\fR and \fBDefMemPerNode\fR are mutually exclusive. @@ -294,7 +294,7 @@ not be stored, just collected). Default real memory size available per allocated node in MegaBytes. Used to avoid over\-subscribing memory and causing paging. \fBDefMemPerNode\fR would generally be used if whole nodes -are alocated to jobs (\fBSelectType=select/linear\fR) and +are allocated to jobs (\fBSelectType=select/linear\fR) and resources are shared (\fBShared=yes\fR or \fBShared=force\fR). The default value is 0 (unlimited). Also see \fBDefMemPerCPU\fR and \fBMaxMemPerNode\fR. 
@@ -423,7 +423,7 @@ match the actual hardware configuration if \fBSchedulerType=sched/gang\fR or \fBSelectType=select/cons_res\fR are configured (both of those plugins maintain resource allocation information using bitmaps for the cores in the system and must remain static, while the node's memory and disk space can -be esblished later). +be established later). .TP \fB2\fR Consider the configuration of each node to be that specified in the @@ -647,7 +647,7 @@ May not exceed 65533. Maximum real memory size available per allocated CPU in MegaBytes. Used to avoid over\-subscribing memory and causing paging. \fBMaxMemPerCPU\fR would generally be used if individual processors -are alocated to jobs (\fBSelectType=select/cons_res\fR). +are allocated to jobs (\fBSelectType=select/cons_res\fR). The default value is 0 (unlimited). Also see \fBDefMemPerCPU\fR and \fBMaxMemPerNode\fR. \fBMaxMemPerCPU\fR and \fBMaxMemPerNode\fR are mutually exclusive. @@ -660,7 +660,7 @@ not be stored, just collected). Maximum real memory size available per allocated node in MegaBytes. Used to avoid over\-subscribing memory and causing paging. \fBMaxMemPerNode\fR would generally be used if whole nodes -are alocated to jobs (\fBSelectType=select/linear\fR) and +are allocated to jobs (\fBSelectType=select/linear\fR) and resources are shared (\fBShared=yes\fR or \fBShared=force\fR). The default value is 0 (unlimited). Also see \fBDefMemPerNode\fR and \fBMaxMemPerCPU\fR. @@ -704,15 +704,15 @@ LAM MPI and Open MPI). \fBMpiParams\fR MPI parameters. Used to identify ports used by OpenMPI only and the input format is -"ports=12000\-12999" to identify a range of communcation ports to be used. +"ports=12000\-12999" to identify a range of communication ports to be used. .TP \fBOverTimeLimit\fR Number of minutes by which a job can exceed its time limit before -being cancelled. +being canceled. The configured job time limit is treated as a \fIsoft\fR limit. 
Adding \fBOverTimeLimit\fR to the \fIsoft\fR limit provides a \fIhard\fR -limit, at which point the job is cancelled. +limit, at which point the job is canceled. This is particularly useful for backfill scheduling, which bases upon each job's soft time limit. The default value is zero. @@ -751,20 +751,20 @@ method specification with the two options comma separated. \fBOFF\fR is the default value and disables job preemption and gang scheduling. This is the only option compatible with \fBSchedulerType=sched/wiki\fR -or \fBSchedulerType=sched/wiki2\fR (used by Maui and Moab respecitvely, +or \fBSchedulerType=sched/wiki2\fR (used by Maui and Moab respectively, which provide their own job preemption functionality). .TP \fBCANCEL\fR always cancel the job. .TP \fBCHECKPOINT\fR -preempts jobs by checkpointing them (if possible) or cancelling them. +preempts jobs by checkpointing them (if possible) or canceling them. .TP \fBGANG\fR enables gang scheduling (time slicing) of jobs in the same partition. .TP \fBREQUEUE\fR -preempts jobs by requeuing them (if possible) or cancelling them. +preempts jobs by requeuing them (if possible) or canceling them. .TP \fBSUSPEND\fR preempts jobs by suspending them. @@ -933,7 +933,7 @@ prevents regular users from viewing reservations. .TP \fBusage\fR (NON-SLURMDBD ACCOUNTING ONLY) prevents users from viewing -usage of any other user. This applys to sreport. +usage of any other user. This applies to sreport. .TP \fBusers\fR (NON-SLURMDBD ACCOUNTING ONLY) prevents users from viewing @@ -1184,12 +1184,26 @@ would run \fBxterm\fR with the title set to the SLURM jobid. .TP \fBSchedulerParameters\fR -The interprettation of this parameter varies by \fBSchedulerType\fR. -In the case of \fBSchedulerType=sched/backfill\fR, there is one -optional argument of the form "interval=#", where "#" is number of -seconds between iterations. 
Higher values result in less overhead -and responsivenss, The default value is 5 secondson BlueGene systems -and 10 seconds otherwise. +The interpretation of this parameter varies by \fBSchedulerType\fR. +Multiple options may be comma separated. +The following options apply only to \fBSchedulerType=sched/backfill\fR. +.RS +.TP +\fBinterval=#\fR +The number of seconds between iterations. +Higher values result in less overhead and less responsiveness. +The default value is 5 seconds on BlueGene systems and 10 seconds otherwise. +.TP +\fBmax_job_bf=#\fR +The maximum number of jobs to attempt backfill scheduling for +(i.e. the queue depth). +Higher values result in more overhead and less responsiveness. +Until an attempt is made to backfill schedule a job, its expected +initiation time value will not be set. +The default value is 50. +In the case of large clusters (more than 1000 nodes) configured with \fBSelectType=select/cons_res\fR, setting a smaller value may be +desirable. +.RE .TP \fBSchedulerPort\fR @@ -1234,6 +1248,7 @@ priority job. Effectiveness of backfill scheduling is dependent upon users specifying job time limits, otherwise all jobs will have the same time limit and backfilling is impossible. +Note documentation for the \fBSchedulerParameters\fR option above. .TP \fBsched/gang\fR for gang scheduler (time\-slicing of parallel jobs). This also supports @@ -1441,7 +1456,7 @@ confirm the state of \fBslurmd\fR, the node will not be automatically set to a DOWN state indicating a non\-responsive \fBslurmd\fR, and some other tool will take responsibility for monitoring the state of each compute node and its \fBslurmd\fR daemon. -SLURM's hiearchical communication mechanism is used to ping the \fBslurmd\fR +SLURM's hierarchical communication mechanism is used to ping the \fBslurmd\fR daemons in order to minimize system noise and overhead. The default value is 300 seconds. The value may not exceed 65533 seconds. @@ -2300,10 +2315,10 @@ User ID of the job's owner. 
User name of the job's owner. .SH "NETWORK TOPOLOGY" -SLURM is able to optimze job allocations to minimize network contention. +SLURM is able to optimize job allocations to minimize network contention. Special SLURM logic is used to optimize allocations on systems with a three\-dimensional interconnect (BlueGene, Sun Constellation, etc.) -and information about configuring those systems are availble on +and information about configuring those systems are available on web pages available here: <https://computing.llnl.gov/linux/slurm/>. For a hierarchical network, SLURM needs to have detailed information about how nodes are configured on the network switches. diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 9897cfef17c5341622415cfff1bbec73b037ddbe..223cbc3deb962e3d6e4ccc32711d518165c8c217 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -90,12 +90,13 @@ int backfilled_jobs = 0; static bool new_work = false; static bool stop_backfill = false; static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; +static int max_backfill_job_cnt = 50; #ifndef BACKFILL_INTERVAL # ifdef HAVE_BG # define BACKFILL_INTERVAL 5 # else -# define BACKFILL_INTERVAL 15 +# define BACKFILL_INTERVAL 10 # endif #endif @@ -103,10 +104,6 @@ static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; * detailed logging for the entire slurmctld daemon */ #define __DEBUG 0 -/* Do not attempt to build job/resource/time record for - * more than MAX_BACKFILL_JOB_CNT records */ -#define MAX_BACKFILL_JOB_CNT 100 - /* Do not build job/resource/time record for more than this * far in the future, in seconds, currently one day */ #define BACKFILL_WINDOW (24 * 60 * 60) @@ -317,7 +314,7 @@ extern void *backfill_agent(void *args) struct timeval tv1, tv2; char tv_str[20], *sched_params, *tmp_ptr; time_t now; - int backfill_interval = 0, i, iter; + int backfill_interval = BACKFILL_INTERVAL, i, iter; 
static time_t last_backfill_time = 0; /* Read config, and partitions; Write jobs and nodes */ slurmctld_lock_t all_locks = { @@ -325,13 +322,17 @@ extern void *backfill_agent(void *args) sched_params = slurm_get_sched_params(); if (sched_params && (tmp_ptr=strstr(sched_params, "interval="))) - backfill_interval = atoi(tmp_ptr+9); - else - backfill_interval = BACKFILL_INTERVAL; + backfill_interval = atoi(tmp_ptr + 9); if (backfill_interval < 1) { fatal("Invalid backfill scheduler interval: %d", backfill_interval); } + if (sched_params && (tmp_ptr=strstr(sched_params, "max_job_bf="))) + max_backfill_job_cnt = atoi(tmp_ptr + 11); + if (max_backfill_job_cnt < 1) { + fatal("Invalid backfill scheduler max_job_bf: %d", + max_backfill_job_cnt); + } while (!stop_backfill) { iter = (BACKFILL_CHECK_SEC * 1000000) / @@ -345,7 +346,7 @@ extern void *backfill_agent(void *args) break; now = time(NULL); - if (!_more_work() || job_is_completing() || + if (!_more_work() || _job_is_completing() || (difftime(now, last_backfill_time) < backfill_interval)) continue; last_backfill_time = now; @@ -374,7 +375,7 @@ static void _attempt_backfill(void) uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; time_t now = time(NULL), later_start, start_res; - node_space_map_t node_space[MAX_BACKFILL_JOB_CNT + 3]; + node_space_map_t *node_space; static int sched_timeout = 0; if(!sched_timeout) @@ -389,6 +390,8 @@ static void _attempt_backfill(void) sort_job_queue(job_queue, job_queue_size); + node_space = xmalloc(sizeof(node_space_map_t) * + (max_backfill_job_cnt + 3)); node_space[0].begin_time = now; node_space[0].end_time = now + BACKFILL_WINDOW; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); @@ -536,7 +539,7 @@ static void _attempt_backfill(void) continue; } - if (node_space_recs == MAX_BACKFILL_JOB_CNT) { + if (node_space_recs == max_backfill_job_cnt) { /* Already have too many jobs to deal with */ break; } @@ -564,6 +567,7 @@ static void 
_attempt_backfill(void) if ((i = node_space[i].next) == 0) break; } + xfree(node_space); xfree(job_queue); } diff --git a/src/plugins/sched/backfill/backfill_wrapper.c b/src/plugins/sched/backfill/backfill_wrapper.c index a51c1a5c04376b5628bd0738d9f55e3a7e3ecb68..43370669c08e79a0db4874ed154356496c39588c 100644 --- a/src/plugins/sched/backfill/backfill_wrapper.c +++ b/src/plugins/sched/backfill/backfill_wrapper.c @@ -18,7 +18,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -71,7 +71,8 @@ int init( void ) pthread_mutex_lock( &thread_flag_mutex ); if ( backfill_thread ) { - debug2( "Backfill thread already running, not starting another" ); + debug2( "Backfill thread already running, not starting " + "another" ); pthread_mutex_unlock( &thread_flag_mutex ); return SLURM_ERROR; }