From 5675c5f7a05e87111f1fbdcbbad35ce3986da1c4 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Thu, 7 Apr 2016 14:36:34 -0700 Subject: [PATCH] Log poor backfill configuration parameters Document and log cases where the per-user (bf_max_job_user) or per-partition (bf_max_job_part) job limit is equal to or greater than bf_max_job_test. In that case, a single user can easily stop all backfill scheduling. --- doc/man/man5/slurm.conf.5 | 5 ++++- src/plugins/sched/backfill/backfill.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 045a940cfea..67ff8abb7ff 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -2444,7 +2444,8 @@ scheduler. This can be especially helpful for systems with large numbers of partitions and jobs. The default value is 0, which means no limit. This option applies only to \fBSchedulerType=sched/backfill\fR. -Also see the \fBpartition_job_depth\fR option. +Also see the \fBpartition_job_depth\fR and \fBbf_max_job_test\fR options. +Set \fBbf_max_job_test\fR to a value much higher than \fBbf_max_job_part\fR. .TP \fBbf_max_job_start=#\fR The maximum number of jobs which can be initiated in a single iteration @@ -2470,6 +2471,8 @@ queue with jobs that cannot start and that prevent jobs from other users to start. This is similar to the MAXIJOB limit in Maui. The default value is 0, which means no limit. This option applies only to \fBSchedulerType=sched/backfill\fR. +Also see the \fBbf_max_job_part\fR and \fBbf_max_job_test\fR options. +Set \fBbf_max_job_test\fR to a value much higher than \fBbf_max_job_user\fR. 
.TP \fBbf_min_age_reserve=#\fR The backfill and main scheduling logic will not reserve resources for pending diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index c5fffba5af7..a664771574a 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -555,6 +555,11 @@ static void _load_config(void) max_backfill_job_per_part); max_backfill_job_per_part = 0; } + if ((max_backfill_job_per_part != 0) && + (max_backfill_job_per_part >= max_backfill_job_cnt)) { + error("bf_max_job_part >= bf_max_job_test (%u >= %u)", + max_backfill_job_per_part, max_backfill_job_cnt); + } if (sched_params && (tmp_ptr=strstr(sched_params, "bf_max_job_start="))) max_backfill_jobs_start = atoi(tmp_ptr + 17); @@ -571,6 +576,11 @@ static void _load_config(void) max_backfill_job_per_user); max_backfill_job_per_user = 0; } + if ((max_backfill_job_per_user != 0) && + (max_backfill_job_per_user >= max_backfill_job_cnt)) { + error("bf_max_job_user >= bf_max_job_test (%u >= %u)", + max_backfill_job_per_user, max_backfill_job_cnt); + } if (sched_params && (tmp_ptr=strstr(sched_params, "bf_min_age_reserve="))) @@ -1552,6 +1562,19 @@ next_task: info("backfill: table size limit of %u reached", max_backfill_job_cnt); } + if ((max_backfill_job_per_part != 0) && + (max_backfill_job_per_part >= + max_backfill_job_cnt)) { + error("bf_max_job_part >= bf_max_job_test (%u >= %u)", + max_backfill_job_per_part, + max_backfill_job_cnt); + } else if ((max_backfill_job_per_user != 0) && + (max_backfill_job_per_user >= + max_backfill_job_cnt)) { + error("bf_max_job_user >= bf_max_job_test (%u >= %u)", + max_backfill_job_per_user, + max_backfill_job_cnt); + } break; } -- GitLab