From b493c5dc50e3bbca9f4369ae475a5e9bbafc53b7 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Wed, 11 Jun 2014 13:15:23 -0700
Subject: [PATCH] backfill - continue after failed job start

When a decision is made to start a job and that job's
start fails for some reason, the backfill scheduler would
previously just exit. With this change, it logs an error,
reserves the resources the job was expected to use, and
continues down the job queue.
---
 src/plugins/sched/backfill/backfill.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 82e48e1c265..a9e57a3fecd 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -1088,12 +1088,12 @@ static int _attempt_backfill(void)
 				job_ptr->start_time = 0;
 				continue;
 			} else if (rc != SLURM_SUCCESS) {
-				if (debug_flags & DEBUG_FLAG_BACKFILL) {
-					info("backfill: planned start of job "
-					     "%u failed", job_ptr->job_id);
-				}
-				job_ptr->start_time = 0;
-				break;
+				error("backfill: planned start of job %u "
+				      "failed: %s", job_ptr->job_id,
+				      slurm_strerror(rc));
+				/* Drop through and reserve these resources */
+				job_ptr->time_limit = orig_time_limit;
+				later_start = 0;
 			} else {
 				/* Started this job, move to next one */
 				reject_array_job_id = 0;
@@ -1147,7 +1147,8 @@ static int _attempt_backfill(void)
 			      backfill_resolution;
 		end_reserve = (end_reserve / backfill_resolution) *
 			      backfill_resolution;
-		if (_test_resv_overlap(node_space, avail_bitmap,
+		if ((job_ptr->start_time > now) &&
+		    _test_resv_overlap(node_space, avail_bitmap,
 				       start_time, end_reserve)) {
 			/* This job overlaps with an existing reservation for
 			 * job to be backfill scheduled, which the sched
-- 
GitLab