From d72b13f220a10869618f2cf1267c46dcdad8b745 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Mon, 23 Jan 2017 10:53:18 -0700
Subject: [PATCH] Fix for backfill launch job with reboot

This bug was likely the root cause of bug 3366. If the backfill scheduler
allocated resources for a batch job and a node reboot was required, the
batch launch RPC would be sent to the agent. At that point, there was a
race condition between the agent and the job_time_limit() function
testing for boot completion. If the job_time_limit() function ran
first, it would trigger a second launch RPC request to be sent to
the agent.

Fix this by having the backfill scheduler call launch_job() for a batch
job only when the job is not in the configuring state
(!IS_JOB_CONFIGURING), instead of testing details->prolog_running.
bug 3366
---
 src/plugins/sched/backfill/backfill.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 9892a77130a..23339edab16 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -1856,8 +1856,7 @@ static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap)
 		power_g_job_start(job_ptr);
 		if (job_ptr->batch_flag == 0)
 			srun_allocate(job_ptr->job_id);
-		else if ((job_ptr->details == NULL) ||
-			 (job_ptr->details->prolog_running == 0))
+		else if (!IS_JOB_CONFIGURING(job_ptr))
 			launch_job(job_ptr);
 		slurmctld_diag_stats.backfilled_jobs++;
 		slurmctld_diag_stats.last_backfilled_jobs++;
-- 
GitLab