From 9a2d863c07eadc78e0dd3c43e3818f4745328954 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Fri, 28 Feb 2014 11:05:14 -0800 Subject: [PATCH] select/cray step cleanup If the slurm.conf changes to remove nodes from a job step in the process of running Node Health Check, the step record would be purged. Then when the NHC completes, its step record pointer would be invalid. The change here will prevent such a job step record from being purged. This was found while investigating bug 612, but would be a very unusual condition and does not fix this bug. --- src/slurmctld/step_mgr.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 06eb37bb41c..b6f988c6674 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -308,24 +308,31 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id) ListIterator step_iterator; struct step_record *step_ptr; int error_code; + uint16_t cleaning = 0; xassert(job_ptr); error_code = ENOENT; if (!job_ptr->step_list) return error_code; - step_iterator = list_iterator_create (job_ptr->step_list); last_job_update = time(NULL); + step_iterator = list_iterator_create (job_ptr->step_list); while ((step_ptr = (struct step_record *) list_next (step_iterator))) { - if (step_ptr->step_id == step_id) { - list_remove (step_iterator); - _free_step_rec(step_ptr); - error_code = 0; + if (step_ptr->step_id != step_id) + continue; + + error_code = 0; + select_g_select_jobinfo_get(step_ptr->select_jobinfo, + SELECT_JOBDATA_CLEANING, + &cleaning); + if (!cleaning) /* Step clean-up already in progress. */ break; - } + list_remove(step_iterator); + _free_step_rec(step_ptr); + break; } + list_iterator_destroy(step_iterator); - list_iterator_destroy (step_iterator); return error_code; } -- GitLab