diff --git a/NEWS b/NEWS index 686aff207fc6994bcc3b406079422df3576cfdb0..1cad266ad44a33f9ab222c959a9b6545d0080ca3 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,8 @@ documents those changes that are of interest to users and admins. RebootProgram to reboot nodes allocated to a job before it begins execution. -- Added squeue -O/--Format option that makes all job and step fields available for printing. + -- Added SchedulerParameters option of bf_max_job_array_resv to control how + many tasks of a job array should have resources reserved for them. * Changes in Slurm 14.11.0pre1 ============================== diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 633d5ce37932167a809e0d064e87616b83a444ba..8d6452648651e905a4e591047fbd917beae259bc 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -73,6 +73,8 @@ CONFIGURATION FILE CHANGES (see man appropriate man page for details) -- Added HealthCheckNodeState option of "cycle" to cycle through the compute nodes over the course of HealthCheckInterval rather than running all at the same time. + -- Added SchedulerParameters option of bf_max_job_array_resv to control how + many tasks of a job array should have resources reserved for them. DBD CONFIGURATION FILE CHANGES (see "man slurmdbd.conf" for details) ==================================================================== diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 1fde13b437143040ff26cf99c5e7d4803c2a0a35..bf21e9d5589e70a992897d38960895fab86e1f51 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1936,6 +1936,17 @@ Higher values result in less overhead and better responsiveness. The default value is 30 seconds. This option applies only to \fBSchedulerType=sched/backfill\fR. .TP +\fBbf_max_job_array_resv=#\fR +The maximum number of tasks from a job array for which to reserve resources in +the future. +Since job arrays can potentially have millions of tasks, the overhead in +reserving resources for all tasks can be prohibitive. 
+In addition, various limits may prevent all the jobs from starting at the +expected times. +This has no impact upon the number of tasks from a job array that can be +started immediately, only those tasks expected to start at some future time. +The default value is 20 tasks. +.TP \fBbf_max_job_part=#\fR The maximum number of jobs per partition to attempt backfill scheduling for, not counting jobs which cannot be started due to an association resource diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 8073120e1c3724cab5d6c13144ab57cc101069cd..7187592590ece8d6957c68835f5b8796e55b2c83 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -90,25 +90,11 @@ #include "src/slurmctld/srun_comm.h" #include "backfill.h" -#ifndef BACKFILL_INTERVAL -# define BACKFILL_INTERVAL 30 -#endif - -#ifndef BACKFILL_RESOLUTION -# define BACKFILL_RESOLUTION 60 -#endif - -/* Do not build job/resource/time record for more than this - * far in the future, in seconds, currently one day */ -#ifndef BACKFILL_WINDOW -#define BACKFILL_WINDOW (24 * 60 * 60) -#endif - -/* Length of uid/njobs arrays used for limiting the number of jobs - * per user considered in each backfill iteration */ -#ifndef BF_MAX_USERS -# define BF_MAX_USERS 1000 -#endif +#define BACKFILL_INTERVAL 30 +#define BACKFILL_RESOLUTION 60 +#define BACKFILL_WINDOW (24 * 60 * 60) +#define BF_MAX_USERS 1000 +#define BF_MAX_JOB_ARRAY_RESV 20 #define SLURMCTLD_THREAD_LIMIT 5 #define SCHED_TIMEOUT 2000000 /* time in micro-seconds */ @@ -135,6 +121,7 @@ static uint32_t debug_flags = 0; static int backfill_interval = BACKFILL_INTERVAL; static int backfill_resolution = BACKFILL_RESOLUTION; static int backfill_window = BACKFILL_WINDOW; +static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; static int max_backfill_job_cnt = 100; static int max_backfill_job_per_part = 0; static int max_backfill_job_per_user = 0; @@ -482,10 +469,7 @@ static void 
_load_config(void) max_backfill_job_cnt); max_backfill_job_cnt = 50; } - /* "bf_res=" is vestigial from version 2.3 and can be removed later. - * Only "bf_resolution=" is documented. */ - if (sched_params && (tmp_ptr=strstr(sched_params, "bf_res="))) - backfill_resolution = atoi(tmp_ptr + 7); + if (sched_params && (tmp_ptr=strstr(sched_params, "bf_resolution="))) backfill_resolution = atoi(tmp_ptr + 14); if (backfill_resolution < 1) { @@ -494,6 +478,15 @@ static void _load_config(void) backfill_resolution = BACKFILL_RESOLUTION; } + if (sched_params && + (tmp_ptr=strstr(sched_params, "bf_max_job_array_resv="))) + bf_max_job_array_resv = atoi(tmp_ptr + 22); + if (bf_max_job_array_resv < 0) { + error("Invalid SchedulerParameters bf_max_job_array_resv: %d", + bf_max_job_array_resv); + bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV; + } + if (sched_params && (tmp_ptr=strstr(sched_params, "bf_max_job_part="))) max_backfill_job_per_part = atoi(tmp_ptr + 16); if (max_backfill_job_per_part < 0) { @@ -1287,7 +1280,8 @@ next_task: } else { test_array_count++; } - if (test_array_count < job_ptr->array_recs->task_cnt) + if ((test_array_count < bf_max_job_array_resv) && + (test_array_count < job_ptr->array_recs->task_cnt)) goto next_task; } }