From b493c5dc50e3bbca9f4369ae475a5e9bbafc53b7 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Wed, 11 Jun 2014 13:15:23 -0700 Subject: [PATCH] backfill - continue after failed job start When a decision is made to start a job, if for some reason that job's start failed, the backfill scheduler would previously just exit. With this change, it logs the event and reserves the resources expected to be used and continues down the job queue. --- src/plugins/sched/backfill/backfill.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 82e48e1c265..a9e57a3fecd 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -1088,12 +1088,12 @@ static int _attempt_backfill(void) job_ptr->start_time = 0; continue; } else if (rc != SLURM_SUCCESS) { - if (debug_flags & DEBUG_FLAG_BACKFILL) { - info("backfill: planned start of job " - "%u failed", job_ptr->job_id); - } - job_ptr->start_time = 0; - break; + error("backfill: planned start of job %u " + "failed: %s", job_ptr->job_id, + slurm_strerror(rc)); + /* Drop through and reserve these resources */ + job_ptr->time_limit = orig_time_limit; + later_start = 0; } else { /* Started this job, move to next one */ reject_array_job_id = 0; @@ -1147,7 +1147,8 @@ static int _attempt_backfill(void) backfill_resolution; end_reserve = (end_reserve / backfill_resolution) * backfill_resolution; - if (_test_resv_overlap(node_space, avail_bitmap, + if ((job_ptr->start_time > now) && + _test_resv_overlap(node_space, avail_bitmap, start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched -- GitLab