From 491f717bacd505acf76dceb61fcab6a9a84eee4f Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 5 May 2011 08:40:22 -0700
Subject: [PATCH] Disable expansion of suspended job

If gang scheduling is configured, then expanding a job can only proceed
if both jobs execute in the same time slice. For now, just disable job
expansion if either job involved in the expansion is suspended. This
limitation might be removed in the future.
---
 slurm/slurm_errno.h      | 15 ++++++++-------
 src/common/slurm_errno.c |  2 ++
 src/slurmctld/job_mgr.c  | 13 ++++++++++---
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index 838b9cf3f65..d49d00f5d81 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -121,7 +121,7 @@ enum {
 	ESLURM_ERROR_ON_DESC_TO_RECORD_COPY,
 	ESLURM_JOB_MISSING_SIZE_SPECIFICATION,
 	ESLURM_JOB_SCRIPT_MISSING,
-	ESLURM_USER_ID_MISSING,
+	ESLURM_USER_ID_MISSING =			2010,
 	ESLURM_DUPLICATE_JOB_ID,
 	ESLURM_PATHNAME_TOO_LONG,
 	ESLURM_NOT_TOP_PRIORITY,
@@ -131,7 +131,7 @@ enum {
 	ESLURM_INVALID_JOB_ID,
 	ESLURM_INVALID_NODE_NAME,
 	ESLURM_WRITING_TO_FILE,
-	ESLURM_TRANSITION_STATE_NO_UPDATE,
+	ESLURM_TRANSITION_STATE_NO_UPDATE =		2020,
 	ESLURM_ALREADY_DONE,
 	ESLURM_INTERCONNECT_FAILURE,
 	ESLURM_BAD_DIST,
@@ -141,7 +141,7 @@ enum {
 	ESLURM_IN_STANDBY_MODE,
 	ESLURM_INVALID_NODE_STATE,
 	ESLURM_INVALID_FEATURE,
-	ESLURM_INVALID_AUTHTYPE_CHANGE,
+	ESLURM_INVALID_AUTHTYPE_CHANGE =		2030,
 	ESLURM_INVALID_CHECKPOINT_TYPE_CHANGE,
 	ESLURM_INVALID_SCHEDTYPE_CHANGE,
 	ESLURM_INVALID_SELECTTYPE_CHANGE,
@@ -151,7 +151,7 @@ enum {
 	ESLURM_DISABLED,
 	ESLURM_DEPENDENCY,
 	ESLURM_BATCH_ONLY,
-	ESLURM_TASKDIST_ARBITRARY_UNSUPPORTED,
+	ESLURM_TASKDIST_ARBITRARY_UNSUPPORTED =		2040,
 	ESLURM_TASKDIST_REQUIRES_OVERCOMMIT,
 	ESLURM_JOB_HELD,
 	ESLURM_INVALID_CRYPTO_TYPE_CHANGE,
@@ -161,7 +161,7 @@ enum {
 	ESLURM_SAME_PARENT_ACCOUNT,
 	ESLURM_INVALID_LICENSES,
 	ESLURM_NEED_RESTART,
-	ESLURM_ACCOUNTING_POLICY,
+	ESLURM_ACCOUNTING_POLICY =			2050,
 	ESLURM_INVALID_TIME_LIMIT,
 	ESLURM_RESERVATION_ACCESS,
 	ESLURM_RESERVATION_INVALID,
@@ -171,7 +171,7 @@ enum {
 	ESLURM_INVALID_WCKEY,
 	ESLURM_RESERVATION_OVERLAP,
 	ESLURM_PORTS_BUSY,
-	ESLURM_PORTS_INVALID,
+	ESLURM_PORTS_INVALID =				2060,
 	ESLURM_PROLOG_RUNNING,
 	ESLURM_NO_STEPS,
 	ESLURM_INVALID_BLOCK_STATE,
@@ -181,7 +181,7 @@ enum {
 	ESLURM_QOS_PREEMPTION_LOOP,
 	ESLURM_NODE_NOT_AVAIL,
 	ESLURM_INVALID_CPU_COUNT,
-	ESLURM_PARTITION_NOT_AVAIL,
+	ESLURM_PARTITION_NOT_AVAIL =			2070,
 	ESLURM_CIRCULAR_DEPENDENCY,
 	ESLURM_INVALID_GRES,
 	ESLURM_JOB_NOT_PENDING,
@@ -189,6 +189,7 @@ enum {
 	ESLURM_PARTITION_IN_USE,
 	ESLURM_EXPAND_GRES,
 	ESLURM_STEP_LIMIT,
+	ESLURM_JOB_SUSPENDED,
 
 	/* switch specific error codes, specific values defined in plugin module */
 	ESLURM_SWITCH_MIN = 3000,
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index 4c948acbb11..f2b584ac0ba 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -280,6 +280,8 @@ static slurm_errtab_t slurm_errtab[] = {
 	  "Job expansion with generic resource (gres) not supported"	},
 	{ ESLURM_STEP_LIMIT,
 	  "Step limit reached for this job"			},
+	{ ESLURM_JOB_SUSPENDED,
+	  "Job is current suspended, requested operation disabled"	},
 
 	/* slurmd error codes */
 
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index dfd05985d09..d35bdd14bc3 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -6859,7 +6859,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 	/* Reset min and max node counts as needed, insure consistency */
 	if (job_specs->min_nodes != NO_VAL) {
 		if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
-			;	/* shrink running job, handle later */
+			;	/* shrink running job, processed later */
 		else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
 			error_code = ESLURM_DISABLED;
 		else if (job_specs->min_nodes < 1) {
@@ -7344,8 +7344,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 	    (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
 		/* Use req_nodes to change the nodes associated with a running
 		 * for lack of other field in the job request to use */
-		if ((job_specs->min_nodes == 0) && IS_JOB_RUNNING(job_ptr) &&
-		    (job_ptr->node_cnt > 0) &&
+		if ((job_specs->min_nodes == 0) && (job_ptr->node_cnt > 0) &&
 		    job_ptr->details && job_ptr->details->expanding_jobid) {
 			struct job_record *expand_job_ptr;
 			bitstr_t *orig_job_node_bitmap;
@@ -7360,6 +7359,14 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 				error_code = ESLURM_INVALID_JOB_ID;
 				goto fini;
 			}
+			if (IS_JOB_SUSPENDED(job_ptr) ||
+			    IS_JOB_SUSPENDED(expand_job_ptr)) {
+				info("Can not expand job %u from job %u, "
+				     "job is suspended",
+				     expand_job_ptr->job_id, job_ptr->job_id);
+				error_code = ESLURM_JOB_SUSPENDED;
+				goto fini;
+			}
 			if ((job_ptr->step_list != NULL) &&
 			    (list_count(job_ptr->step_list) != 0)) {
 				info("Attempt to merge job %u with active "
-- 
GitLab