From 5675c5f7a05e87111f1fbdcbbad35ce3986da1c4 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Thu, 7 Apr 2016 14:36:34 -0700
Subject: [PATCH] Log poor backfill configuration parameters

Document and log cases where the max jobs per user or per partition
  is equal to or greater than bf_max_job_test (the max jobs to test).
  In that case, a single user can easily stop all backfill scheduling.
---
 doc/man/man5/slurm.conf.5             |  5 ++++-
 src/plugins/sched/backfill/backfill.c | 23 +++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 045a940cfea..67ff8abb7ff 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -2444,7 +2444,8 @@ scheduler. This can be especially helpful for systems with large numbers of
 partitions and jobs.
 The default value is 0, which means no limit.
 This option applies only to \fBSchedulerType=sched/backfill\fR.
-Also see the \fBpartition_job_depth\fR option.
+Also see the \fBpartition_job_depth\fR and \fBbf_max_job_test\fR options.
+Set \fBbf_max_job_test\fR to a value much higher than \fBbf_max_job_part\fR.
 .TP
 \fBbf_max_job_start=#\fR
 The maximum number of jobs which can be initiated in a single iteration
@@ -2470,6 +2471,8 @@ queue with jobs that cannot start and that prevent jobs from other users
 to start.  This is similar to the MAXIJOB limit in Maui.
 The default value is 0, which means no limit.
 This option applies only to \fBSchedulerType=sched/backfill\fR.
+Also see the \fBbf_max_job_part\fR and \fBbf_max_job_test\fR options.
+Set \fBbf_max_job_test\fR to a value much higher than \fBbf_max_job_user\fR.
 .TP
 \fBbf_min_age_reserve=#\fR
 The backfill and main scheduling logic will not reserve resources for pending
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index c5fffba5af7..a664771574a 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -555,6 +555,11 @@ static void _load_config(void)
 		      max_backfill_job_per_part);
 		max_backfill_job_per_part = 0;
 	}
+	if ((max_backfill_job_per_part != 0) &&
+	    (max_backfill_job_per_part >= max_backfill_job_cnt)) {
+		error("bf_max_job_part >= bf_max_job_test (%u >= %u)",
+		      max_backfill_job_per_part, max_backfill_job_cnt);
+	}
 
 	if (sched_params && (tmp_ptr=strstr(sched_params, "bf_max_job_start=")))
 		max_backfill_jobs_start = atoi(tmp_ptr + 17);
@@ -571,6 +576,11 @@ static void _load_config(void)
 		      max_backfill_job_per_user);
 		max_backfill_job_per_user = 0;
 	}
+	if ((max_backfill_job_per_user != 0) &&
+	    (max_backfill_job_per_user >= max_backfill_job_cnt)) {
+		error("bf_max_job_user >= bf_max_job_test (%u >= %u)",
+		      max_backfill_job_per_user, max_backfill_job_cnt);
+	}
 
 	if (sched_params &&
 	    (tmp_ptr=strstr(sched_params, "bf_min_age_reserve=")))
@@ -1552,6 +1562,19 @@ next_task:
 				info("backfill: table size limit of %u reached",
 				     max_backfill_job_cnt);
 			}
+			if ((max_backfill_job_per_part != 0) &&
+			    (max_backfill_job_per_part >=
+			     max_backfill_job_cnt)) {
+				error("bf_max_job_part >= bf_max_job_test (%u >= %u)",
+				      max_backfill_job_per_part,
+				      max_backfill_job_cnt);
+			} else if ((max_backfill_job_per_user != 0) &&
+				   (max_backfill_job_per_user >=
+				    max_backfill_job_cnt)) {
+				error("bf_max_job_user >= bf_max_job_test (%u >= %u)",
+				      max_backfill_job_per_user,
+				      max_backfill_job_cnt);
+			}
 			break;
 		}
 
-- 
GitLab