From d72b13f220a10869618f2cf1267c46dcdad8b745 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Mon, 23 Jan 2017 10:53:18 -0700 Subject: [PATCH] Fix for backfill launch job with reboot This bug was likely the root cause of bug 3366. If the backfill scheduler allocates resources for a batch job and a node reboot is required, the batch launch RPC would be sent to the agent. At that point, there is a race condition between the agent and the job_time_limit() function testing for boot completion. If the job_time_limit() function ran first, it would cause a second launch RPC request to be sent to the agent. bug 3366 --- src/plugins/sched/backfill/backfill.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 9892a77130a..23339edab16 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -1856,8 +1856,7 @@ static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap) power_g_job_start(job_ptr); if (job_ptr->batch_flag == 0) srun_allocate(job_ptr->job_id); - else if ((job_ptr->details == NULL) || - (job_ptr->details->prolog_running == 0)) + else if (!IS_JOB_CONFIGURING(job_ptr)) launch_job(job_ptr); slurmctld_diag_stats.backfilled_jobs++; slurmctld_diag_stats.last_backfilled_jobs++; -- GitLab