diff --git a/NEWS b/NEWS index a0c36144fb95d9f78a825fa47916ae61a32bbfcb..112e7f94d69d6f4533a75f8bf53f627f116a0962 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,10 @@ documents those changes that are of interest to users and administrators. * Changes in Slurm 14.11.6 ========================== + -- If SchedulerParameters value of bf_min_age_reserve is configured, then + a newly submitted job can start immediately even if there is a higher + priority non-runnable job which has been waiting for less time than + bf_min_age_reserve. * Changes in Slurm 14.11.5 ========================== diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index c15aa287122b3ed93f8cacd0903bd038690a2863..cacbb2fffa034c209ef2d96e3507dea56ec474a6 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -2099,7 +2099,10 @@ This option applies only to \fBSchedulerType=sched/backfill\fR. .TP \fBbf_min_age_reserve=#\fR The backfill scheduler will not reserve resources for pending jobs until they -have been pending for at least the specified number of seconds. +have been runnable for at least the specified number of seconds. +In addition, jobs waiting for less than the specified number of seconds will +not prevent a newly submitted job from starting immediately, even if the newly +submitted job has a lower priority. This can be valuable if jobs lack time limits or all time limits have the same value. 
The default value is zero, which will reserve resources for any pending job diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index aa4f030227db1cc17cb93709bc6152ce88763cd9..8c993bad10bf0b4aa392c297ecc9a5fc6b5c4310 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -125,6 +125,7 @@ List job_list = NULL; /* job_record list */ time_t last_job_update; /* time of last update to job records */ /* Local variables */ +static int bf_min_age_reserve = 0; static uint32_t highest_prio = 0; static uint32_t lowest_prio = TOP_PRIORITY; static int hash_table_size = 0; @@ -3755,7 +3756,9 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, struct job_record **job_pptr, char **err_msg, uint16_t protocol_version) { - static int defer_sched = -1; + static time_t sched_update = 0; + static int defer_sched = 0; + char *sched_params, *tmp_ptr; int error_code, i; bool no_alloc, top_prio, test_only, too_fragmented, independent; struct job_record *job_ptr; @@ -3821,12 +3824,21 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, else too_fragmented = false; - if (defer_sched == -1) { - char *sched_params = slurm_get_sched_params(); + if (sched_update != slurmctld_conf.last_update) { + sched_update = slurmctld_conf.last_update; + sched_params = slurm_get_sched_params(); if (sched_params && strstr(sched_params, "defer")) defer_sched = 1; else defer_sched = 0; + if (sched_params && + (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) { + bf_min_age_reserve = atoi(tmp_ptr + 19); + if (bf_min_age_reserve < 0) + bf_min_age_reserve = 0; + } else { + bf_min_age_reserve = 0; + } xfree(sched_params); } if (defer_sched == 1) @@ -3837,6 +3849,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, else top_prio = true; /* don't bother testing, * it is not runable anyway */ + if (immediate && (too_fragmented || (!top_prio) || (!independent))) { job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; @@ -8536,6 
+8549,8 @@ extern void sync_job_priorities(void) static bool _top_priority(struct job_record *job_ptr) { struct job_details *detail_ptr = job_ptr->details; + time_t now = time(NULL); + int pend_time; bool top; #ifdef HAVE_BG @@ -8576,6 +8591,15 @@ static bool _top_priority(struct job_record *job_ptr) * indicative of job requeue */ continue; } + + if (bf_min_age_reserve) { + if (job_ptr2->details->begin_time == 0) + continue; + pend_time = difftime(now, job_ptr2-> + details->begin_time); + if (pend_time < bf_min_age_reserve) + continue; + } if (!acct_policy_job_runnable_state(job_ptr2) || !misc_policy_job_runnable_state(job_ptr2) || !part_policy_job_runnable_state(job_ptr2) || diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 6334b86dfcc8a7c95f1118b3fd53299cad7bd034..12ed59cb66c59f30770f1e506e299d595fc3b832 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -761,7 +761,7 @@ extern int schedule(uint32_t job_limit) ListIterator job_iterator = NULL, part_iterator = NULL; List job_queue = NULL; int failed_part_cnt = 0, failed_resv_cnt = 0, job_cnt = 0; - int error_code, i, j, part_cnt, time_limit; + int error_code, i, j, part_cnt, time_limit, pend_time; uint32_t job_depth = 0; job_queue_rec_t *job_queue_rec; struct job_record *job_ptr = NULL; @@ -785,6 +785,7 @@ extern int schedule(uint32_t job_limit) static bool wiki_sched = false; static bool fifo_sched = false; static int sched_timeout = 0; + static int bf_min_age_reserve = 0; static int def_job_limit = 100; static int max_jobs_per_part = 0; static int defer_rpc_cnt = 0; @@ -851,6 +852,15 @@ extern int schedule(uint32_t job_limit) batch_sched_delay = 3; } + if (sched_params && + (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) { + bf_min_age_reserve = atoi(tmp_ptr + 19); + if (bf_min_age_reserve < 0) + bf_min_age_reserve = 0; + } else { + bf_min_age_reserve = 0; + } + if (sched_params && (tmp_ptr=strstr(sched_params, "build_queue_timeout="))) /* 
01234567890123456789 */ @@ -1324,6 +1334,20 @@ next_task: } } + if (fail_by_part && bf_min_age_reserve) { + /* Consider other jobs in this partition if + * job has been waiting for less than + * bf_min_age_reserve time */ + if (job_ptr->details->begin_time == 0) { + fail_by_part = false; + } else { + pend_time = difftime(now, + job_ptr->details->begin_time); + if (pend_time < bf_min_age_reserve) + fail_by_part = false; + } + } + if (fail_by_part) { /* do not schedule more jobs in this partition * or on nodes in this partition */