From 9a2d863c07eadc78e0dd3c43e3818f4745328954 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Fri, 28 Feb 2014 11:05:14 -0800
Subject: [PATCH] select/cray step cleanup

If slurm.conf is changed to remove nodes from a job step that is in
the process of running Node Health Check (NHC), the step record would
be purged. Then, when the NHC completed, its step record pointer
would be invalid.
The change here prevents such a job step record from being purged.
This was found while investigating bug 612, but it would be a very
unusual condition and this change does not fix that bug.
---
 src/slurmctld/step_mgr.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 06eb37bb41c..b6f988c6674 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -308,24 +308,31 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id)
 	ListIterator step_iterator;
 	struct step_record *step_ptr;
 	int error_code;
+	uint16_t cleaning = 0;
 
 	xassert(job_ptr);
 	error_code = ENOENT;
 	if (!job_ptr->step_list)
 		return error_code;
 
-	step_iterator = list_iterator_create (job_ptr->step_list);
 	last_job_update = time(NULL);
+	step_iterator = list_iterator_create (job_ptr->step_list);
 	while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
-		if (step_ptr->step_id == step_id) {
-			list_remove (step_iterator);
-			_free_step_rec(step_ptr);
-			error_code = 0;
+		if (step_ptr->step_id != step_id)
+			continue;
+
+		error_code = 0;
+		select_g_select_jobinfo_get(step_ptr->select_jobinfo,
+					    SELECT_JOBDATA_CLEANING,
+					    &cleaning);
+		if (cleaning)	/* Clean-up in progress: keep the record. */
 			break;
-		}
+		list_remove(step_iterator);
+		_free_step_rec(step_ptr);
+		break;
 	}
+	list_iterator_destroy(step_iterator);
 
-	list_iterator_destroy (step_iterator);
 	return error_code;
 }
 
-- 
GitLab