From f636c4562a149a784682a3d5753b3f1762a8dff1 Mon Sep 17 00:00:00 2001
From: Michael Hinton <hinton@schedmd.com>
Date: Tue, 6 Apr 2021 13:17:37 -0600
Subject: [PATCH] Never schedule the last task in a job array twice

The last task reuses array_job_id. So if job_ptr doesn't change, that
means we have already scheduled it and should never go to next_task.

job_launch() can requeue the job if it fails. This puts the job in the
completing and pending state, which could allow the last task to get
scheduled *again* without the check added here. Rescheduling a
completing job will destroy its node_bitmap and job_resrcs and cause
the job to stay completing forever.

Bug 10980

Co-authored-by: Brian Christiansen <brian@schedmd.com>
---
 src/plugins/sched/backfill/backfill.c | 3 ++-
 src/slurmctld/job_scheduler.c         | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 39b4770dfd1..010fb3b939f 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -2502,9 +2502,10 @@ skip_start:
 			if (is_job_array_head &&
 			    (job_ptr->array_task_id != NO_VAL)) {
 				/* Try starting next task of job array */
+				job_record_t *tmp = job_ptr;
 				job_ptr = find_job_record(job_ptr->
 							  array_job_id);
-				if (job_ptr &&
+				if (job_ptr && (job_ptr != tmp) &&
 				    IS_JOB_PENDING(job_ptr) &&
 				    (bb_g_job_test_stage_in(
 					     job_ptr, false) == 1))
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 6597707127e..dabf2632101 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -1842,8 +1842,10 @@ skip_start:
 		if (is_job_array_head &&
 		    (job_ptr->array_task_id != NO_VAL)) {
 			/* Try starting another task of the job array */
+			job_record_t *tmp = job_ptr;
 			job_ptr = find_job_record(job_ptr->array_job_id);
-			if (job_ptr && IS_JOB_PENDING(job_ptr) &&
+			if (job_ptr && (job_ptr != tmp) &&
+			    IS_JOB_PENDING(job_ptr) &&
 			    (bb_g_job_test_stage_in(job_ptr,false) ==1))
 				goto next_task;
 		}
-- 
GitLab