From f636c4562a149a784682a3d5753b3f1762a8dff1 Mon Sep 17 00:00:00 2001
From: Michael Hinton <hinton@schedmd.com>
Date: Tue, 6 Apr 2021 13:17:37 -0600
Subject: [PATCH] Never schedule the last task in a job array twice

The last task reuses array_job_id. So if find_job_record() returns the
same job_ptr, that means we have already scheduled it and should never
go to next_task.

job_launch() can requeue the job if the launch fails. This puts the job
in the completing and pending state; without the check added here, that
could allow the last task to get scheduled *again*. Rescheduling a
completing job will destroy its node_bitmap and job_resrcs and cause
the job to stay completing forever.

Bug 10980

Co-authored-by: Brian Christiansen <brian@schedmd.com>
---
 src/plugins/sched/backfill/backfill.c | 3 ++-
 src/slurmctld/job_scheduler.c         | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 39b4770dfd1..010fb3b939f 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -2502,9 +2502,10 @@ skip_start:
 				if (is_job_array_head &&
 				    (job_ptr->array_task_id != NO_VAL)) {
 					/* Try starting next task of job array */
+					job_record_t *tmp = job_ptr;
 					job_ptr = find_job_record(job_ptr->
 								  array_job_id);
-					if (job_ptr &&
+					if (job_ptr && (job_ptr != tmp) &&
 					    IS_JOB_PENDING(job_ptr) &&
 					    (bb_g_job_test_stage_in(
 						    job_ptr, false) == 1))
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 6597707127e..dabf2632101 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -1842,8 +1842,10 @@ skip_start:
 			if (is_job_array_head &&
 			    (job_ptr->array_task_id != NO_VAL)) {
 				/* Try starting another task of the job array */
+				job_record_t *tmp = job_ptr;
 				job_ptr = find_job_record(job_ptr->array_job_id);
-				if (job_ptr && IS_JOB_PENDING(job_ptr) &&
+				if (job_ptr && (job_ptr != tmp) &&
+				    IS_JOB_PENDING(job_ptr) &&
 				    (bb_g_job_test_stage_in(job_ptr,false) ==1))
 					goto next_task;
 			}
-- 
GitLab