From 4d43c7f89255c33c40da3bf9c1fd6413c1d3549e Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 15 Jul 2014 15:50:37 -0700
Subject: [PATCH] Refactor job suspend/resume logic

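Move the per-job suspend/resume work out of job_suspend() into a new
static helper, _job_suspend(), for improved support of job arrays.
job_suspend() keeps the job lookup, the uid validation and the RPC reply.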
---
 src/slurmctld/job_mgr.c | 136 +++++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 65 deletions(-)

diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 136052e8371..0436f978026 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -11109,95 +11109,58 @@ static int _job_resume_test(struct job_record *job_ptr)
 }
 
 /*
- * job_suspend - perform some suspend/resume operation
- * IN sus_ptr - suspend/resume request message
- * IN uid - user id of the user issuing the RPC
- * IN conn_fd - file descriptor on which to send reply,
- *              -1 if none
+ * _job_suspend - perform some suspend/resume operation
+ * job_ptr IN/OUT - job to operate upon, its record is updated in place
+ * op IN - operation: suspend/resume
  * indf_susp IN - set if job is being suspended indefinitely by user or admin
  *                and we should clear it's priority, otherwise suspended
  *		  temporarily for gang scheduling
- * IN protocol_version - slurm protocol version of client
  * RET 0 on success, otherwise ESLURM error code
  */
-extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
-		       slurm_fd_t conn_fd, bool indf_susp,
-		       uint16_t protocol_version)
+static int _job_suspend(struct job_record *job_ptr, uint16_t op, bool indf_susp)
 {
 	int rc = SLURM_SUCCESS;
 	time_t now = time(NULL);
-	struct job_record *job_ptr = NULL;
-	slurm_msg_t resp_msg;
-	return_code_msg_t rc_msg;
-
-#ifdef HAVE_BG
-	rc = ESLURM_NOT_SUPPORTED;
-	goto reply;
-#endif
-
-	/* find the job */
-	job_ptr = find_job_record (sus_ptr->job_id);
-	if (job_ptr == NULL) {
-		rc = ESLURM_INVALID_JOB_ID;
-		goto reply;
-	}
 
-	/* validate the request */
-	if ((uid != 0) && (uid != getuid())) {
-		rc = ESLURM_ACCESS_DENIED;
-		goto reply;
-	}
-	if (IS_JOB_PENDING(job_ptr)) {
-		rc = ESLURM_JOB_PENDING;
-		goto reply;
-	}
-	if (IS_JOB_FINISHED(job_ptr)) {
-		rc = ESLURM_ALREADY_DONE;
-		goto reply;
-	}
-	if ((sus_ptr->op == SUSPEND_JOB) &&
-	    (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS)) {
-		rc = ESLURM_NOT_SUPPORTED;
-		goto reply;
-	}
-	if ((sus_ptr->op == RESUME_JOB) && (rc = _job_resume_test(job_ptr)))
-		goto reply;
+	if (IS_JOB_PENDING(job_ptr))
+		return ESLURM_JOB_PENDING;
+	if (IS_JOB_FINISHED(job_ptr))
+		return ESLURM_ALREADY_DONE;
+	if ((op == SUSPEND_JOB) &&
+	    (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS))
+		return ESLURM_NOT_SUPPORTED;
+	if ((op == RESUME_JOB) && (rc = _job_resume_test(job_ptr)))
+		return rc;
 
 	/* Notify salloc/srun of suspend/resume */
-	srun_job_suspend(job_ptr, sus_ptr->op);
+	srun_job_suspend(job_ptr, op);
 
 	/* perform the operation */
-	if (sus_ptr->op == SUSPEND_JOB) {
-		if (!IS_JOB_RUNNING(job_ptr)) {
-			rc = ESLURM_JOB_NOT_RUNNING;
-			goto reply;
-		}
+	if (op == SUSPEND_JOB) {
+		if (!IS_JOB_RUNNING(job_ptr))
+			return ESLURM_JOB_NOT_RUNNING;
 		rc = _suspend_job_nodes(job_ptr, indf_susp);
 		if (rc != SLURM_SUCCESS)
-			goto reply;
-		_suspend_job(job_ptr, sus_ptr->op, indf_susp);
+			return rc;
+		_suspend_job(job_ptr, op, indf_susp);
 		job_ptr->job_state = JOB_SUSPENDED;
 		if (indf_susp)
 			job_ptr->priority = 0;
 		if (job_ptr->suspend_time) {
 			job_ptr->pre_sus_time +=
-				difftime(now,
-				job_ptr->suspend_time);
+				difftime(now, job_ptr->suspend_time);
 		} else {
 			job_ptr->pre_sus_time +=
-				difftime(now,
-				job_ptr->start_time);
+				difftime(now, job_ptr->start_time);
 		}
 		suspend_job_step(job_ptr);
-	} else if (sus_ptr->op == RESUME_JOB) {
-		if (!IS_JOB_SUSPENDED(job_ptr)) {
-			rc = ESLURM_JOB_NOT_SUSPENDED;
-			goto reply;
-		}
+	} else if (op == RESUME_JOB) {
+		if (!IS_JOB_SUSPENDED(job_ptr))
+			return ESLURM_JOB_NOT_SUSPENDED;
 		rc = _resume_job_nodes(job_ptr, indf_susp);
 		if (rc != SLURM_SUCCESS)
-			goto reply;
-		_suspend_job(job_ptr, sus_ptr->op, indf_susp);
+			return rc;
+		_suspend_job(job_ptr, op, indf_susp);
 		if (job_ptr->priority == 0)
 			set_job_prio(job_ptr);
 		job_ptr->job_state = JOB_RUNNING;
@@ -11218,8 +11181,7 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
 		    (!job_ptr->preempt_time)) {
  			debug3("Job %u resumed, updating end_time",
  			       job_ptr->job_id);
-			job_ptr->end_time = now +
-				(job_ptr->time_limit * 60)
+			job_ptr->end_time = now + (job_ptr->time_limit * 60)
 				- job_ptr->pre_sus_time;
 		}
 		resume_job_step(job_ptr);
@@ -11229,6 +11191,50 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
 	job_ptr->suspend_time = now;
 	jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
 
+	return rc;
+}
+
+/*
+ * job_suspend - perform some suspend/resume operation
+ * IN sus_ptr - suspend/resume request message
+ * IN uid - user id of the user issuing the RPC
+ * IN conn_fd - file descriptor on which to send reply,
+ *              -1 if none
+ * IN indf_susp - set if job is being suspended indefinitely by user or admin
+ *                and we should clear its priority, otherwise suspended
+ *		  temporarily for gang scheduling
+ * IN protocol_version - slurm protocol version of client
+ * RET 0 on success, otherwise ESLURM error code
+ */
+extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
+		       slurm_fd_t conn_fd, bool indf_susp,
+		       uint16_t protocol_version)
+{
+	int rc = SLURM_SUCCESS;
+	struct job_record *job_ptr = NULL;
+	slurm_msg_t resp_msg;
+	return_code_msg_t rc_msg;
+
+#ifdef HAVE_BG
+	rc = ESLURM_NOT_SUPPORTED;
+	goto reply;
+#endif
+
+	/* find the job */
+	job_ptr = find_job_record (sus_ptr->job_id);
+	if (job_ptr == NULL) {
+		rc = ESLURM_INVALID_JOB_ID;
+		goto reply;
+	}
+
+	/* validate the request */
+	if ((uid != 0) && (uid != getuid())) {
+		rc = ESLURM_ACCESS_DENIED;
+		goto reply;
+	}
+
+	rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
+
     reply:
 	if (conn_fd >= 0) {
 		slurm_msg_t_init(&resp_msg);
-- 
GitLab