From c07b7b34b817911cbcc2fea6515d11d6c3de5b7d Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Thu, 6 Apr 2017 14:09:34 -0600
Subject: [PATCH] Revert commit e6e28f449bce78b6711d3bf88c40a6eab870e0e9

The problem will be solved by using per-partition QOS and setting
MaxJobPer[User|Account] on that QOS.
bug 3668
---
 NEWS                                  |  4 --
 doc/man/man5/slurm.conf.5             | 14 +---
 src/plugins/sched/backfill/backfill.c | 95 +--------------------------
 3 files changed, 4 insertions(+), 109 deletions(-)

diff --git a/NEWS b/NEWS
index 14b03fa1e05..dc934a60276 100644
--- a/NEWS
+++ b/NEWS
@@ -47,10 +47,6 @@ documents those changes that are of interest to users and administrators.
     in a federation.
  -- Add squeue --local and --sibling options to modify filtering of jobs on
     federated clusters.
- -- Add SchedulerParameters option of bf_max_job_user_part to specifiy the
-    maximum number of jobs per user for any single partition. This differs from
-    bf_max_job_user in that a separate counter is applied to each partition
-    rather than having a single counter per user applied to all partitions.
 
 * Changes in Slurm 17.02.2
 ==========================
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index ee031815b05..eb50c245c23 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -2560,24 +2560,14 @@ This option applies only to \fBSchedulerType=sched/backfill\fR.
 .TP
 \fBbf_max_job_user=#\fR
 The maximum number of jobs per user to attempt starting with the backfill
-scheduler for ALL partitions.
-One can set this limit to prevent users from flooding the backfill
+scheduler. One can set this limit to prevent users from flooding the backfill
 queue with jobs that cannot start and that prevent jobs from other users to
 start. This is similar to the MAXIJOB limit in Maui.
 The default value is 0, which means no limit.
 This option applies only to \fBSchedulerType=sched/backfill\fR.
-Also see the \fBbf_max_job_part\fR, \fBbf_max_job_test\fR and
-\fBbf_max_job_user_part=#\fR options.
+Also see the \fBbf_max_job_part\fR and \fBbf_max_job_test\fR options.
 Set \fBbf_max_job_test\fR to a value much higher than \fBbf_max_job_user\fR.
 .TP
-\fBbf_max_job_user_part=#\fR
-The maximum number of jobs per user per partition to attempt starting with the
-backfill scheduler for any single partition.
-The default value is 0, which means no limit.
-This option applies only to \fBSchedulerType=sched/backfill\fR.
-Also see the \fBbf_max_job_part\fR, \fBbf_max_job_test\fR and
-\fBbf_max_job_user=#\fR options.
-.TP
 \fBbf_min_age_reserve=#\fR
 The backfill and main scheduling logic will not reserve resources for pending
 jobs until they have been pending and runnable for at least the specified
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index d53b97e6c01..269c2e52a79 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -112,13 +112,6 @@ typedef struct node_space_map {
 	int next;	/* next record, by time, zero termination */
 } node_space_map_t;
 
-typedef struct user_part_rec {
-	uint16_t *njobs;
-	struct part_record *part_ptr;
-	uint32_t *uid;
-	int user_cnt;
-} user_part_rec_t;
-
 /* Diag statistics */
 extern diag_stats_t slurmctld_diag_stats;
 uint32_t bf_sleep_usec = 0;
@@ -141,7 +134,6 @@ static uint32_t bf_min_prio_reserve = 0;
 static int max_backfill_job_cnt = 100;
 static int max_backfill_job_per_part = 0;
 static int max_backfill_job_per_user = 0;
-static int max_backfill_job_per_user_part = 0;
 static int max_backfill_jobs_start = 0;
 static bool backfill_continue = false;
 static bool assoc_limit_stop = false;
@@ -615,23 +607,6 @@ static void _load_config(void)
 		}
 	}
 
-	if (sched_params &&
-	    (tmp_ptr = strstr(sched_params, "bf_max_job_user_part="))) {
-		max_backfill_job_per_user_part = atoi(tmp_ptr + 21);
-		if (max_backfill_job_per_user_part < 0) {
-			error("Invalid SchedulerParameters bf_max_job_user_part: %d",
-			      max_backfill_job_per_user_part);
-			max_backfill_job_per_user_part = 0;
-		}
-	} else {
-		max_backfill_job_per_user_part = 0;
-	}
-	if ((max_backfill_job_per_user_part != 0) &&
-	    (max_backfill_job_per_user_part > max_backfill_job_cnt)) {
-		info("warning: bf_max_job_user_part > bf_max_job_test (%u > %u)",
-		     max_backfill_job_per_user_part, max_backfill_job_cnt);
-	}
-
 	bf_min_age_reserve = 0;
 	if (sched_params &&
 	    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
@@ -913,7 +888,7 @@ static int _attempt_backfill(void)
 	List job_queue;
 	job_queue_rec_t *job_queue_rec;
 	slurmdb_qos_rec_t *qos_ptr = NULL;
-	int bb, i, j, k, node_space_recs, mcs_select = 0;
+	int bb, i, j, node_space_recs, mcs_select = 0;
 	struct job_record *job_ptr;
 	struct part_record *part_ptr, **bf_part_ptr = NULL;
 	uint32_t end_time, end_reserve, deadline_time_limit, boot_time;
@@ -924,7 +899,6 @@ static int _attempt_backfill(void)
 	time_t now, sched_start, later_start, start_res, resv_end, window_end;
 	time_t orig_sched_start, orig_start_time = (time_t) 0;
 	node_space_map_t *node_space;
-	user_part_rec_t *bf_user_part_ptr = NULL;
 	struct timeval bf_time1, bf_time2;
 	int rc = 0, error_code;
 	int job_test_count = 0, test_time_count = 0, pend_time;
@@ -1032,23 +1006,6 @@ static int _attempt_backfill(void)
 		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
 		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
 	}
-	if (max_backfill_job_per_user_part) {
-		ListIterator part_iterator;
-		struct part_record *part_ptr;
-		bf_parts = list_count(part_list);
-		bf_user_part_ptr = xmalloc(sizeof(user_part_rec_t) * bf_parts);
-		part_iterator = list_iterator_create(part_list);
-		i = 0;
-		while ((part_ptr = (struct part_record *)
-				   list_next(part_iterator))) {
-			bf_user_part_ptr[i].part_ptr = part_ptr;
-			bf_user_part_ptr[i].njobs =
-				xmalloc(BF_MAX_USERS * sizeof(uint16_t));
-			bf_user_part_ptr[i++].uid =
-				xmalloc(BF_MAX_USERS * sizeof(uint32_t));
-		}
-		list_iterator_destroy(part_iterator);
-	}
 
 	sort_job_queue(job_queue);
 	while (1) {
@@ -1255,47 +1212,6 @@ next_task:
 					     job_ptr->part_ptr->name);
 		}
 
-		if (max_backfill_job_per_user_part) {
-			bool skip_job = false;
-			for (j = 0; j < bf_parts; j++) {
-				if (bf_user_part_ptr[j].part_ptr !=
-				    job_ptr->part_ptr)
-					continue;
-				for (k = 0; k < bf_user_part_ptr[j].user_cnt;
-				     k++) {
-					if (bf_user_part_ptr[j].uid[k] !=
-					    job_ptr->user_id)
-						continue;
-					if (bf_user_part_ptr[j].njobs[k]++ >=
-					    max_backfill_job_per_user_part)
-						skip_job = true;
-					break;
-				}
-				if ((k == bf_user_part_ptr[j].user_cnt) &&
-				    (k < BF_MAX_USERS)) {
-					bf_user_part_ptr[j].user_cnt++;
-					bf_user_part_ptr[j].uid[k] =
-						job_ptr->user_id;
-					if (bf_user_part_ptr[j].njobs[k]++ >=
-					    max_backfill_job_per_user_part)
-						skip_job = true;
-				}
-				break;
-			}
-			if (skip_job) {
-				if (debug_flags & DEBUG_FLAG_BACKFILL)
-					info("backfill: have already "
-					     "checked %u jobs for user %u on "
-					     "partition %s; skipping "
-					     "job %u",
-					     max_backfill_job_per_user_part,
-					     job_ptr->user_id,
-					     job_ptr->part_ptr->name,
-					     job_ptr->job_id);
-				continue;
-			}
-		}
-
 		if (max_backfill_job_per_part) {
 			bool skip_job = false;
 			for (j = 0; j < bf_parts; j++) {
@@ -1346,7 +1262,7 @@ next_task:
 					     "Total #users now %u",
 					     job_ptr->user_id, nuser);
 			} else {
-				if (njobs[j] > max_backfill_job_per_user) {
+				if (njobs[j] >= max_backfill_job_per_user) {
 					/* skip job */
 					if (debug_flags & DEBUG_FLAG_BACKFILL)
 						info("backfill: have already "
@@ -1935,13 +1851,6 @@ next_task:
 	xfree(bf_part_ptr);
 	xfree(uid);
 	xfree(njobs);
-	if (bf_user_part_ptr) {
-		for (i = 0; i < bf_parts; i++) {
-			xfree(bf_user_part_ptr[i].njobs);
-			xfree(bf_user_part_ptr[i].uid);
-		}
-		xfree(bf_user_part_ptr);
-	}
 	FREE_NULL_BITMAP(avail_bitmap);
 	FREE_NULL_BITMAP(exc_core_bitmap);
 	FREE_NULL_BITMAP(resv_bitmap);
-- 
GitLab
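
For reference, the replacement the commit message points to — a per-partition
QOS carrying MaxJobsPerUser (or MaxJobsPerAccount) — can be sketched as
follows. This is a minimal, illustrative configuration, not part of the
patch: the QOS name "part_debug", the partition name "debug", the node list,
and the limit of 10 are all assumptions chosen for the example.

    # Create a QOS and set the per-user job limit on it
    # (MaxJobsPerUser caps how many jobs each user may have
    #  running under this QOS):
    sacctmgr add qos part_debug
    sacctmgr modify qos part_debug set MaxJobsPerUser=10

    # In slurm.conf, attach the QOS to the partition so the limit
    # applies to every job submitted to that partition:
    PartitionName=debug Nodes=node[01-16] QOS=part_debug

Note the semantics differ slightly: the reverted bf_max_job_user_part only
bounded how many jobs per user the backfill scheduler would evaluate in each
partition, whereas MaxJobsPerUser on the partition QOS limits how many jobs
per user may run in the partition at once.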