From 7734d9c0a057420c73e5ee8097c3c2b04ee9e0ea Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 11 Aug 2009 20:54:30 +0000
Subject: [PATCH] gang: avoid resuming a job that was not suspended by gang
 (e.g. one suspended by admin action); add some more logging

---
 src/slurmctld/gang.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c
index 58c3e411cb4..ea4d2ad8b39 100644
--- a/src/slurmctld/gang.c
+++ b/src/slurmctld/gang.c
@@ -1430,6 +1430,8 @@ extern int gs_reconfig(void)
 			for (i = 0; i < p_ptr->num_jobs; i++) {
 				if (p_ptr->job_list[i]->sig_state == 
 				    GS_SUSPEND) {
+					info("resuming job in missing part %s",
+					     p_ptr->part_name);
 					_resume_job(p_ptr->job_list[i]->
 						   job_id);
 					p_ptr->job_list[i]->sig_state = 
@@ -1447,8 +1449,8 @@ extern int gs_reconfig(void)
 		 * necessary). NOTE: there could be jobs that only overlap
 		 * on nodes that are no longer in the partition, but we're
 		 * not going to worry about those cases.
-		 */
-		/* add the jobs from p_ptr into new_ptr in their current order
+		 *
+		 * add the jobs from p_ptr into new_ptr in their current order
 		 * to preserve the state of timeslicing.
 		 */
 		for (i = 0; i < p_ptr->num_jobs; i++) {
@@ -1457,9 +1459,13 @@ extern int gs_reconfig(void)
 				/* job no longer exists in SLURM, so drop it */
 				continue;
 			}
-			/* resume any job that is suspended */
-			if (IS_JOB_SUSPENDED(job_ptr))
+			/* resume any job that is suspended by us */
+			if (IS_JOB_SUSPENDED(job_ptr) && 
+			    (job_ptr->priority != 0)) {
+				debug3("resuming job %u apparently suspended "
+				       " by gang", job_ptr->job_id);
 				_resume_job(job_ptr->job_id);
+			}
 
 			/* transfer the job as long as it is still active */
 			if (IS_JOB_SUSPENDED(job_ptr) ||
-- 
GitLab