Skip to content
Snippets Groups Projects
Commit 057a55cf authored by Morris Jette's avatar Morris Jette
Browse files

Add SchedulerParameters bf_max_job_array_resv

Added SchedulerParameters option bf_max_job_array_resv to control how
many tasks of a job array should have resources reserved for them.
parent 36ce0bbf
No related branches found
No related tags found
No related merge requests found
......@@ -15,6 +15,8 @@ documents those changes that are of interest to users and admins.
RebootProgram to reboot nodes allocated to a job before it begins execution.
-- Added squeue -O/--Format option that makes all job and step fields available
for printing.
-- Added SchedulerParameters option bf_max_job_array_resv to control how
many tasks of a job array should have resources reserved for them.
* Changes in Slurm 14.11.0pre1
==============================
......
......@@ -73,6 +73,8 @@ CONFIGURATION FILE CHANGES (see man appropriate man page for details)
-- Added HealthCheckNodeState option of "cycle" to cycle through the compute
nodes over the course of HealthCheckInterval rather than running all at
the same time.
-- Added SchedulerParameters option bf_max_job_array_resv to control how
many tasks of a job array should have resources reserved for them.
DBD CONFIGURATION FILE CHANGES (see "man slurmdbd.conf" for details)
====================================================================
......
......@@ -1936,6 +1936,17 @@ Higher values result in less overhead and better responsiveness.
The default value is 30 seconds.
This option applies only to \fBSchedulerType=sched/backfill\fR.
.TP
\fBbf_max_job_array_resv=#\fR
The maximum number of tasks from a job array for which to reserve resources in
the future.
Since job arrays can potentially have millions of tasks, the overhead in
reserving resources for all tasks can be prohibitive.
In addition, various limits may prevent all the jobs from starting at the
expected times.
This has no impact upon the number of tasks from a job array that can be
started immediately, only those tasks expected to start at some future time.
The default value is 20 tasks.
.TP
\fBbf_max_job_part=#\fR
The maximum number of jobs per partition to attempt backfill scheduling for,
not counting jobs which cannot be started due to an association resource
......
......@@ -90,25 +90,11 @@
#include "src/slurmctld/srun_comm.h"
#include "backfill.h"
#ifndef BACKFILL_INTERVAL
# define BACKFILL_INTERVAL 30
#endif
#ifndef BACKFILL_RESOLUTION
# define BACKFILL_RESOLUTION 60
#endif
/* Do not build job/resource/time record for more than this
* far in the future, in seconds, currently one day */
#ifndef BACKFILL_WINDOW
#define BACKFILL_WINDOW (24 * 60 * 60)
#endif
/* Length of uid/njobs arrays used for limiting the number of jobs
* per user considered in each backfill iteration */
#ifndef BF_MAX_USERS
# define BF_MAX_USERS 1000
#endif
/* Compiled-in defaults for the backfill scheduler's tunable values. */

/* Initial value of backfill_interval (presumably seconds - confirm). */
#define BACKFILL_INTERVAL 30
/* Initial value of backfill_resolution (presumably seconds - confirm). */
#define BACKFILL_RESOLUTION 60
/* Do not build job/resource/time records for more than this far in the
 * future, in seconds; currently one day. */
#define BACKFILL_WINDOW (24 * 60 * 60)
/* Length of the uid/njobs arrays used for limiting the number of jobs
 * per user considered in each backfill iteration. */
#define BF_MAX_USERS 1000
/* Default for SchedulerParameters bf_max_job_array_resv: the maximum number
 * of tasks from a single job array for which resources are reserved in the
 * future. */
#define BF_MAX_JOB_ARRAY_RESV 20
#define SLURMCTLD_THREAD_LIMIT 5
#define SCHED_TIMEOUT 2000000 /* time in micro-seconds */
......@@ -135,6 +121,7 @@ static uint32_t debug_flags = 0;
/* Backfill tunables. Each starts at its built-in default; several are
 * overridden from SchedulerParameters by _load_config(). */
static int backfill_interval = BACKFILL_INTERVAL;
/* Set from "bf_resolution=" in SchedulerParameters. */
static int backfill_resolution = BACKFILL_RESOLUTION;
static int backfill_window = BACKFILL_WINDOW;
/* Set from "bf_max_job_array_resv="; a negative configured value is reported
 * as an error and replaced by BF_MAX_JOB_ARRAY_RESV. Caps how many tasks of
 * one job array are tested before the next_task loop stops. */
static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
static int max_backfill_job_cnt = 100;
/* Set from "bf_max_job_part=" in SchedulerParameters. */
static int max_backfill_job_per_part = 0;
static int max_backfill_job_per_user = 0;
......@@ -482,10 +469,7 @@ static void _load_config(void)
max_backfill_job_cnt);
max_backfill_job_cnt = 50;
}
/* "bf_res=" is vestigial from version 2.3 and can be removed later.
* Only "bf_resolution=" is documented. */
if (sched_params && (tmp_ptr=strstr(sched_params, "bf_res=")))
backfill_resolution = atoi(tmp_ptr + 7);
if (sched_params && (tmp_ptr=strstr(sched_params, "bf_resolution=")))
backfill_resolution = atoi(tmp_ptr + 14);
if (backfill_resolution < 1) {
......@@ -494,6 +478,15 @@ static void _load_config(void)
backfill_resolution = BACKFILL_RESOLUTION;
}
if (sched_params &&
(tmp_ptr=strstr(sched_params, "bf_max_job_array_resv=")))
bf_max_job_array_resv = atoi(tmp_ptr + 22);
if (bf_max_job_array_resv < 0) {
error("Invalid SchedulerParameters bf_max_job_array_resv: %d",
bf_max_job_array_resv);
bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
}
if (sched_params && (tmp_ptr=strstr(sched_params, "bf_max_job_part=")))
max_backfill_job_per_part = atoi(tmp_ptr + 16);
if (max_backfill_job_per_part < 0) {
......@@ -1287,7 +1280,8 @@ next_task:
} else {
test_array_count++;
}
if (test_array_count < job_ptr->array_recs->task_cnt)
if ((test_array_count < bf_max_job_array_resv) &&
(test_array_count < job_ptr->array_recs->task_cnt))
goto next_task;
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment