From 5a8dbc3904c4e1acf9e128e21750de7796ee7342 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 16 Dec 2005 17:21:21 +0000
Subject: [PATCH] Change suspend/resume from job step to job granularity.

---
 doc/man/man1/scontrol.1          | 14 +++-------
 doc/man/man3/slurm_resume.3      | 18 ++++---------
 slurm/slurm.h.in                 | 10 +++----
 src/api/suspend.c                | 23 +++++++---------
 src/common/slurm_protocol_defs.h |  5 ++--
 src/common/slurm_protocol_pack.c |  2 --
 src/scontrol/scontrol.c          | 31 ++++++++++------------
 src/slurmctld/job_mgr.c          | 45 ++++++++++++++++++++++++++++++++
 src/slurmctld/proc_req.c         | 24 +++++------------
 src/slurmctld/slurmctld.h        |  4 +--
 src/slurmctld/step_mgr.c         | 45 --------------------------------
 11 files changed, 92 insertions(+), 129 deletions(-)

diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
index 5d74c8f405d..a9160d9bd58 100644
--- a/doc/man/man1/scontrol.1
+++ b/doc/man/man1/scontrol.1
@@ -126,11 +126,8 @@ parameters are to be changed: AuthType, BackupAddr, BackupController,
 ControlAddr, ControlMach, PluginDir, StateSaveLocation, SlurmctldPort 
 or SlurmdPort.
 .TP
-\fIresume\fP \fIID\fP
-Resume a previously suspended job or job step.
-\fIID\fP can be used to identify a specific job (e.g. "<job_id>",
-which applies to all of its existing steps)
-or a specific job step (e.g. "<job_id>.<step_id>").
+\fIresume\fP \fIjob_id\fP
+Resume a previously suspended job.
 .TP
 \fIshow\fP \fIENTITY\fP \fIID\fP
 Display the state of the specified entity with the specified identification.
@@ -150,12 +147,9 @@ Instruct all Slurm daemons to save current state and terminate.
 The Slurm controller (slurmctld) forwards the request all other daemons 
 (slurmd daemon on each compute node). 
 .TP
-\fIsuspend\fP \fIID\fP
-Suspend a running job or job step. 
+\fIsuspend\fP \fIjob_id\fP
+Suspend a running job.
 Use the \fIresume\fP command to resume its execution.
-\fIID\fP can be used to identify a specific job (e.g. "<job_id>",
-which applies to all of its existing steps)
-or a specific job step (e.g. "<job_id>.<step_id>").
 .TP
 \fIupdate\fP \fISPECIFICATION\fP 
 Update job, node or partition configuration per the supplied specification.
diff --git a/doc/man/man3/slurm_resume.3 b/doc/man/man3/slurm_resume.3
index ce247242b95..b6b18dc90e7 100644
--- a/doc/man/man3/slurm_resume.3
+++ b/doc/man/man3/slurm_resume.3
@@ -10,17 +10,13 @@ slurm_suspend, slurm_resume \- Slurm suspend and resume functions
 .LP
 int \fBslurm_suspend\fR (
 .br
-	uint32_t \fIjob_id\fP,
-.br
-	uint32_t \fIstep_id\fP
+	uint32_t \fIjob_id\fP
 .br
 );
 .LP
 int \fBslurm_resume\fR (
 .br
-	uint32_t \fIjob_id\fP,
-.br
-	uint32_t \fIstep_id\fP
+	uint32_t \fIjob_id\fP
 .br
 );
 
@@ -29,18 +25,14 @@ int \fBslurm_resume\fR (
 .TP
 \fIjob_id\fP
 SLURM job ID to perform the operation upon.
-.TP
-\fIstep_id\fP
-SLURM job step ID to perform the operation upon. 
-May be NO_VAL if the operation is to be performed on all steps of the specified job.
 
 .SH "DESCRIPTION"
 .LP
 \fBslurm_suspend\fR
-Suspend the specified job or job step.
+Suspend the specified job.
 .LP
 \fBslurm_resume\fR
-Resume execution of a previously suspended job or job step.
+Resume execution of a previously suspended job.
 
 .SH "RETURN VALUE"
 .LP
@@ -48,7 +40,7 @@ Zero is returned upon success.
 On error, -1 is returned, and the Slurm error code is set appropriately.
 .SH "ERRORS"
 .LP
-\fBESLURM_INVALID_JOB_ID\fR the requested job or job step id does not exist. 
+\fBESLURM_INVALID_JOB_ID\fR the requested job id does not exist. 
 .LP
 \fBESLURM_ACCESS_DENIED\fR the requesting user lacks authorization for the requested 
 action (e.g. trying to delete or modify another user's job). 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index de3912f874b..d796b63f6d3 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1234,20 +1234,18 @@ extern int slurm_shutdown PARAMS(( uint16_t core ));
 \*****************************************************************************/
 
 /*
- * slurm_suspend - suspend execution of a job step.
+ * slurm_suspend - suspend execution of a job.
  * IN job_id  - job on which to perform operation
- * IN step_id - job step on which to perform operation
  * RET 0 or a slurm error code
  */
-extern int slurm_suspend PARAMS(( uint32_t job_id, uint32_t step_id ));
+extern int slurm_suspend PARAMS(( uint32_t job_id ));
 
 /*
- * slurm_resume - resume execution of a previously suspended job step.
+ * slurm_resume - resume execution of a previously suspended job.
  * IN job_id  - job on which to perform operation
- * IN step_id - job step on which to perform operation
  * RET 0 or a slurm error code
  */
-extern int slurm_resume PARAMS(( uint32_t job_id, uint32_t step_id ));
+extern int slurm_resume PARAMS(( uint32_t job_id ));
 
 /*****************************************************************************\
  *      SLURM JOB CHECKPOINT FUNCTIONS
diff --git a/src/api/suspend.c b/src/api/suspend.c
index ddf2790a9d1..a04adf756ac 100644
--- a/src/api/suspend.c
+++ b/src/api/suspend.c
@@ -32,15 +32,15 @@
 #include <slurm/slurm.h>
 #include "src/common/slurm_protocol_api.h"
 
-static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id);
+static int _suspend_op (uint16_t op, uint32_t job_id);
 /*
- * _suspend_op - perform a suspend/resume operation for some job step.
+ * _suspend_op - perform a suspend/resume operation for some job.
  * IN op      - operation to perform
  * IN job_id  - job on which to perform operation
  * IN step_id - job step on which to perform operation
  * RET 0 or a slurm error code
  */
-static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id)
+static int _suspend_op (uint16_t op, uint32_t job_id)
 {
 	int rc;
 	checkpoint_msg_t sus_req;
@@ -48,7 +48,6 @@ static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id)
 
 	sus_req.op       = op;
 	sus_req.job_id   = job_id;
-	sus_req.step_id  = step_id;
 	req_msg.msg_type = REQUEST_SUSPEND;
 	req_msg.data     = &sus_req;
 
@@ -60,25 +59,21 @@ static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id)
 }
 
 /*
- * slurm_suspend - suspend execution of a job step.
+ * slurm_suspend - suspend execution of a job.
  * IN job_id  - job on which to perform operation
- * IN step_id - job step on which to perform operation or NO_VAL 
- *		for all of the job's steps
  * RET 0 or a slurm error code
  */
-extern int slurm_suspend (uint32_t job_id, uint32_t step_id)
+extern int slurm_suspend (uint32_t job_id)
 {
-	return _suspend_op (SUSPEND_STEP, job_id, step_id);
+	return _suspend_op (SUSPEND_JOB, job_id);
 }
 
 /*
- * slurm_resume - resume execution of a previously suspended job step.
+ * slurm_resume - resume execution of a previously suspended job.
  * IN job_id  - job on which to perform operation
- * IN step_id - job step on which to perform operation or NO_VAL 
- *		for all of the job's steps
  * RET 0 or a slurm error code
  */
-extern int slurm_resume (uint32_t job_id, uint32_t step_id)
+extern int slurm_resume (uint32_t job_id)
 {
-	return _suspend_op (RESUME_STEP, job_id, step_id);
+	return _suspend_op (RESUME_JOB, job_id);
 }
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index ae9f88618bf..a10462dd1d4 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -66,8 +66,8 @@ enum part_shared {
 };
 
 enum suspend_opts {
-	SUSPEND_STEP,		/* Suspend a job step now */
-	RESUME_STEP		/* Resume a job step now */
+	SUSPEND_JOB,		/* Suspend a job now */
+	RESUME_JOB		/* Resume a job now */
 };
 
 /* SLURM Message types */
@@ -466,7 +466,6 @@ typedef struct checkpoint_resp_msg {
 typedef struct suspend_msg {
 	uint16_t op;            /* suspend operation, see enum suspend_opts */
 	uint32_t job_id;        /* slurm job_id */
-	uint32_t step_id;       /* slurm step_id */
 } suspend_msg_t;
 
 typedef struct jobacct_msg {
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 179fc186465..18011f0fb33 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -3226,7 +3226,6 @@ static void _pack_suspend_msg(suspend_msg_t *msg, Buf buffer)
 
 	pack16 ( msg -> op,      buffer ) ;
 	pack32 ( msg -> job_id,  buffer ) ;
-	pack32 ( msg -> step_id, buffer ) ;
 }
 
 static int  _unpack_suspend_msg(suspend_msg_t **msg_ptr, Buf buffer)
@@ -3239,7 +3238,6 @@ static int  _unpack_suspend_msg(suspend_msg_t **msg_ptr, Buf buffer)
 
 	safe_unpack16 ( & msg -> op ,      buffer ) ;
 	safe_unpack32 ( & msg -> job_id  , buffer ) ;
-	safe_unpack32 ( & msg -> step_id , buffer ) ;
 	return SLURM_SUCCESS;
 
     unpack_error:
diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c
index e879f868122..b7f11852ada 100644
--- a/src/scontrol/scontrol.c
+++ b/src/scontrol/scontrol.c
@@ -110,7 +110,7 @@ static void	_print_ping (void);
 static void	_print_step (char *job_step_id_str);
 static void     _print_version( void );
 static int	_process_command (int argc, char *argv[]);
-static int	_suspend(char *op, char *job_step_id_str);
+static int	_suspend(char *op, char *job_id_str);
 static void	_update_it (int argc, char *argv[]);
 static int	_update_job (int argc, char *argv[]);
 static int	_update_node (int argc, char *argv[]);
@@ -1913,8 +1913,8 @@ scontrol [<OPTION>] [<COMMAND>]                                            \n\
      show <ENTITY> [<ID>]     display state of identified entity, default  \n\
                               is all records.                              \n\
      shutdown                 shutdown slurm controller.                   \n\
-     suspend <job[.step]>     susend specified job or job step             \n\
-     resume <job[.step]>      resume previously suspended job or job step  \n\
+     suspend <job_id>         suspend specified job                        \n\
+     resume <job_id>          resume previously suspended job              \n\
      update <SPECIFICATIONS>  update job, node, or partition configuration \n\
      verbose                  enable detailed logging.                     \n\
      version                  display tool version number.                 \n\
@@ -2030,36 +2030,33 @@ static int _checkpoint(char *op, char *job_step_id_str)
 /*
  * _suspend - perform some suspend/resume operation
  * IN op - suspend/resume operation
- * IN job_step_id_str - either a job name (for all steps of the given job) or
- *		a step name: "<jid>.<step_id>"
+ * IN job_id_str - a job id
  * RET 0 if no slurm error, errno otherwise. parsing error prints
  *		error message and returns 0
  */
-static int _suspend(char *op, char *job_step_id_str)
+static int _suspend(char *op, char *job_id_str)
 {
 	int rc = SLURM_SUCCESS;
-	uint32_t job_id = 0, step_id = 0;
+	uint32_t job_id = 0;
 	char *next_str;
 
-	if (job_step_id_str) {
-		job_id = (uint32_t) strtol (job_step_id_str, &next_str, 10);
-		if (next_str[0] == '.') {
-			step_id = (uint32_t) strtol (&next_str[1], &next_str, 10);
-		} else
-			step_id = NO_VAL;
+	if (job_id_str) {
+		job_id = (uint32_t) strtol (job_id_str, &next_str, 10);
 		if (next_str[0] != '\0') {
-			fprintf(stderr, "Invalid job step name\n");
+			fprintf(stderr, "Invalid job id specified\n");
+			exit_code = 1;
 			return 0;
 		}
 	} else {
-		fprintf(stderr, "Invalid job step name\n");
+		fprintf(stderr, "Invalid job id specified\n");
+		exit_code = 1;
 		return 0;
 	}
 
 	if (strncasecmp(op, "suspend", 3) == 0)
-		rc = slurm_suspend (job_id, step_id);
+		rc = slurm_suspend (job_id);
 	else
-		rc = slurm_resume (job_id, step_id);
+		rc = slurm_resume (job_id);
 
 	return rc;
 }
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 8e2da4b6166..b7594a520a4 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -3869,3 +3869,48 @@ extern int job_node_ready(uint32_t job_id, int *ready)
 	return SLURM_SUCCESS;
 }
 
+/*
+ * job_suspend - perform some suspend/resume operation
+ * IN ckpt_ptr - suspend/resume request message
+ * IN uid - user id of the user issuing the RPC
+ * IN conn_fd - file descriptor on which to send reply (sent in all cases)
+ * RET 0 on success, otherwise ESLURM error code
+ */
+extern int job_suspend(suspend_msg_t *ckpt_ptr, uid_t uid,
+		slurm_fd conn_fd)
+{
+	int rc = SLURM_SUCCESS;
+	struct job_record *job_ptr;
+	slurm_msg_t resp_msg;
+	return_code_msg_t rc_msg;
+
+	/* find the job */
+	job_ptr = find_job_record (ckpt_ptr->job_id);
+	if (job_ptr == NULL) {
+		rc = ESLURM_INVALID_JOB_ID;
+		goto reply;
+	}
+	/* only the job's owner or root (uid 0) may act on it */
+	if ((uid != job_ptr->user_id) && (uid != 0)) {
+		rc = ESLURM_ACCESS_DENIED ;
+		goto reply;
+	}
+	if (job_ptr->job_state == JOB_PENDING) {
+		rc = ESLURM_JOB_PENDING;
+		goto reply;
+	} else if (job_ptr->job_state != JOB_RUNNING) {
+		rc = ESLURM_ALREADY_DONE;
+		goto reply;
+	}
+
+	/* Not fully supported yet: validated requests are still rejected */
+	rc = ESLURM_NOT_SUPPORTED;
+
+    reply:
+	rc_msg.return_code = rc;
+	resp_msg.msg_type  = RESPONSE_SLURM_RC;
+	resp_msg.data      = &rc_msg;
+	(void) slurm_send_node_msg(conn_fd, &resp_msg);
+	return rc;
+}
+
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index ea4d331dcbe..4ebde8b4ec6 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1874,10 +1874,10 @@ inline static void _slurm_rpc_suspend(slurm_msg_t * msg)
 
 	START_TIMER;
 	switch (sus_ptr->op) {
-		case SUSPEND_STEP:
+		case SUSPEND_JOB:
 			op = "suspend";
 			break;
-		case RESUME_STEP:
+		case RESUME_JOB:
 			op = "resume";
 			break;
 		default:
@@ -1887,26 +1887,16 @@ inline static void _slurm_rpc_suspend(slurm_msg_t * msg)
 	uid = g_slurm_auth_get_uid(msg->cred);
 
 	lock_slurmctld(job_write_lock);
-	error_code = job_step_suspend(sus_ptr, uid, msg->conn_fd);
+	error_code = job_suspend(sus_ptr, uid, msg->conn_fd);
 	unlock_slurmctld(job_write_lock);
 	END_TIMER;
 
 	if (error_code) {
-		if (sus_ptr->step_id == NO_VAL)
-			info("_slurm_rpc_suspend %s %u: %s", op,
-				sus_ptr->job_id, slurm_strerror(error_code));
-		else
-			info("_slurm_rpc_suspend %s %u.%u  %s", op,
-				sus_ptr->job_id, sus_ptr->step_id,
-				slurm_strerror(error_code));
+		info("_slurm_rpc_suspend %s %u: %s", op,
+			sus_ptr->job_id, slurm_strerror(error_code));
 	} else {
-		if (sus_ptr->step_id == NO_VAL)
-			info("_slurm_rpc_suspend %s for %u %s", op,
-				sus_ptr->job_id, TIME_STR);
-		else
-			info("_slurm_rpc_suspend %s for %u.%u %s", op,
-				sus_ptr->job_id, sus_ptr->step_id,
-				TIME_STR);
+		info("_slurm_rpc_suspend %s for %u %s", op,
+			sus_ptr->job_id, TIME_STR);
 		/* NOTE: This function provides it own locks */
 		schedule_job_save();
 	}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 61260d5667a..efdb8502668 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -738,8 +738,8 @@ extern int job_step_checkpoint_comp(checkpoint_comp_msg_t *ckpt_ptr,
  * IN conn_fd - file descriptor on which to send reply
  * RET 0 on success, otherwise ESLURM error code
  */
-extern int job_step_suspend(suspend_msg_t *ckpt_ptr,
-                uid_t uid, slurm_fd conn_fd);
+extern int job_suspend(suspend_msg_t *ckpt_ptr, uid_t uid, 
+		slurm_fd conn_fd);
 
 /* 
  * job_complete - note the normal termination the specified job
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 1da9a30eeb9..ab68f2c30ff 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -951,48 +951,3 @@ extern int job_step_checkpoint_comp(checkpoint_comp_msg_t *ckpt_ptr,
 	(void) slurm_send_node_msg(conn_fd, &resp_msg);
 	return rc;
 }
-
-/*
- * job_step_suspend - perform some suspend/resume operation
- * IN sus_ptr - suspend/resume request message
- * IN uid - user id of the user issuing the RPC
- * IN conn_fd - file descriptor on which to send reply
- * RET 0 on success, otherwise ESLURM error code
- */
-extern int job_step_suspend(suspend_msg_t *ckpt_ptr,
-		uid_t uid, slurm_fd conn_fd)
-{
-	int rc = SLURM_SUCCESS;
-	struct job_record *job_ptr;
-	struct step_record *step_ptr;
-	slurm_msg_t resp_msg;
-	return_code_msg_t rc_msg;
-
-	/* find the job */
-	job_ptr = find_job_record (ckpt_ptr->job_id);
-	if (job_ptr == NULL) {
-		rc = ESLURM_INVALID_JOB_ID;
-		goto reply;
-	}
-	if ((uid != job_ptr->user_id) && (uid != 0)) {
-		rc = ESLURM_ACCESS_DENIED ;
-		goto reply;
-	}
-	if (job_ptr->job_state == JOB_PENDING) {
-		rc = ESLURM_JOB_PENDING;
-		goto reply;
-	} else if (job_ptr->job_state != JOB_RUNNING) {
-		rc = ESLURM_ALREADY_DONE;
-		goto reply;
-	}
-
-	/* Not fully supported yet */
-	rc = ESLURM_NOT_SUPPORTED;
-
-    reply:
-	rc_msg.return_code = rc;
-	resp_msg.msg_type  = RESPONSE_SLURM_RC;
-	resp_msg.data      = &rc_msg;
-	(void) slurm_send_node_msg(conn_fd, &resp_msg);
-	return rc;
-}
-- 
GitLab