From 5a8dbc3904c4e1acf9e128e21750de7796ee7342 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 16 Dec 2005 17:21:21 +0000 Subject: [PATCH] Change suspend/resume from job step to job granularity. --- doc/man/man1/scontrol.1 | 14 +++------- doc/man/man3/slurm_resume.3 | 18 ++++--------- slurm/slurm.h.in | 10 +++---- src/api/suspend.c | 23 +++++++--------- src/common/slurm_protocol_defs.h | 5 ++-- src/common/slurm_protocol_pack.c | 2 -- src/scontrol/scontrol.c | 31 ++++++++++------------ src/slurmctld/job_mgr.c | 45 ++++++++++++++++++++++++++++++++ src/slurmctld/proc_req.c | 24 +++++------------ src/slurmctld/slurmctld.h | 4 +-- src/slurmctld/step_mgr.c | 45 -------------------------------- 11 files changed, 92 insertions(+), 129 deletions(-) diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 5d74c8f405d..a9160d9bd58 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -126,11 +126,8 @@ parameters are to be changed: AuthType, BackupAddr, BackupController, ControlAddr, ControlMach, PluginDir, StateSaveLocation, SlurmctldPort or SlurmdPort. .TP -\fIresume\fP \fIID\fP -Resume a previously suspended job or job step. -\fIID\fP can be used to identify a specific job (e.g. "<job_id>", -which applies to all of its existing steps) -or a specific job step (e.g. "<job_id>.<step_id>"). +\fIresume\fP \fIjob_id\fP +Resume a previously suspended job. .TP \fIshow\fP \fIENTITY\fP \fIID\fP Display the state of the specified entity with the specified identification. @@ -150,12 +147,9 @@ Instruct all Slurm daemons to save current state and terminate. The Slurm controller (slurmctld) forwards the request all other daemons (slurmd daemon on each compute node). .TP -\fIsuspend\fP \fIID\fP -Suspend a running job or job step. +\fIsuspend\fP \fIjob_id\fP +Suspend a running job. Use the \fIresume\fP command to resume its execution. -\fIID\fP can be used to identify a specific job (e.g. "<job_id>", -which applies to all of its existing steps) -or a specific job step (e.g. "<job_id>.<step_id>"). .TP \fIupdate\fP \fISPECIFICATION\fP Update job, node or partition configuration per the supplied specification. diff --git a/doc/man/man3/slurm_resume.3 b/doc/man/man3/slurm_resume.3 index ce247242b95..b6b18dc90e7 100644 --- a/doc/man/man3/slurm_resume.3 +++ b/doc/man/man3/slurm_resume.3 @@ -10,17 +10,13 @@ slurm_suspend, slurm_resume \- Slurm suspend and resume functions .LP int \fBslurm_suspend\fR ( .br - uint32_t \fIjob_id\fP, -.br - uint32_t \fIstep_id\fP + uint32_t \fIjob_id\fP .br ); .LP int \fBslurm_resume\fR ( .br - uint32_t \fIjob_id\fP, -.br - uint32_t \fIstep_id\fP + uint32_t \fIjob_id\fP .br ); @@ -29,18 +25,14 @@ int \fBslurm_resume\fR ( .TP \fIjob_id\fP SLURM job ID to perform the operation upon. -.TP -\fIstep_id\fP -SLURM job step ID to perform the operation upon. -May be NO_VAL if the operation is to be performed on all steps of the specified job. .SH "DESCRIPTION" .LP \fBslurm_suspend\fR -Suspend the specified job or job step. +Suspend the specified job. .LP \fBslurm_resume\fR -Resume execution of a previously suspended job or job step. +Resume execution of a previously suspended job. .SH "RETURN VALUE" .LP @@ -48,7 +40,7 @@ Zero is returned upon success. On error, -1 is returned, and the Slurm error code is set appropriately. .SH "ERRORS" .LP -\fBESLURM_INVALID_JOB_ID\fR the requested job or job step id does not exist. +\fBESLURM_INVALID_JOB_ID\fR the requested job id does not exist. .LP \fBESLURM_ACCESS_DENIED\fR the requesting user lacks authorization for the requested action (e.g. trying to delete or modify another user's job). diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index de3912f874b..d796b63f6d3 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1234,20 +1234,18 @@ extern int slurm_shutdown PARAMS(( uint16_t core )); \*****************************************************************************/ /* - * slurm_suspend - suspend execution of a job step. + * slurm_suspend - suspend execution of a job. * IN job_id - job on which to perform operation - * IN step_id - job step on which to perform operation * RET 0 or a slurm error code */ -extern int slurm_suspend PARAMS(( uint32_t job_id, uint32_t step_id )); +extern int slurm_suspend PARAMS(( uint32_t job_id )); /* - * slurm_resume - resume execution of a previously suspended job step. + * slurm_resume - resume execution of a previously suspended job. * IN job_id - job on which to perform operation - * IN step_id - job step on which to perform operation * RET 0 or a slurm error code */ -extern int slurm_resume PARAMS(( uint32_t job_id, uint32_t step_id )); +extern int slurm_resume PARAMS(( uint32_t job_id )); /*****************************************************************************\ * SLURM JOB CHECKPOINT FUNCTIONS diff --git a/src/api/suspend.c b/src/api/suspend.c index ddf2790a9d1..a04adf756ac 100644 --- a/src/api/suspend.c +++ b/src/api/suspend.c @@ -32,15 +32,15 @@ #include <slurm/slurm.h> #include "src/common/slurm_protocol_api.h" -static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id); +static int _suspend_op (uint16_t op, uint32_t job_id); /* - * _suspend_op - perform a suspend/resume operation for some job step. + * _suspend_op - perform a suspend/resume operation for some job. * IN op - operation to perform * IN job_id - job on which to perform operation * IN step_id - job step on which to perform operation * RET 0 or a slurm error code */ -static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id) +static int _suspend_op (uint16_t op, uint32_t job_id) { int rc; checkpoint_msg_t sus_req; @@ -48,7 +48,6 @@ static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id) sus_req.op = op; sus_req.job_id = job_id; - sus_req.step_id = step_id; req_msg.msg_type = REQUEST_SUSPEND; req_msg.data = &sus_req; @@ -60,25 +59,21 @@ static int _suspend_op (uint16_t op, uint32_t job_id, uint32_t step_id) } /* - * slurm_suspend - suspend execution of a job step. + * slurm_suspend - suspend execution of a job. * IN job_id - job on which to perform operation - * IN step_id - job step on which to perform operation or NO_VAL - * for all of the job's steps * RET 0 or a slurm error code */ -extern int slurm_suspend (uint32_t job_id, uint32_t step_id) +extern int slurm_suspend (uint32_t job_id) { - return _suspend_op (SUSPEND_STEP, job_id, step_id); + return _suspend_op (SUSPEND_JOB, job_id); } /* - * slurm_resume - resume execution of a previously suspended job step. + * slurm_resume - resume execution of a previously suspended job. * IN job_id - job on which to perform operation - * IN step_id - job step on which to perform operation or NO_VAL - * for all of the job's steps * RET 0 or a slurm error code */ -extern int slurm_resume (uint32_t job_id, uint32_t step_id) +extern int slurm_resume (uint32_t job_id) { - return _suspend_op (RESUME_STEP, job_id, step_id); + return _suspend_op (RESUME_JOB, job_id); } diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index ae9f88618bf..a10462dd1d4 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -66,8 +66,8 @@ enum part_shared { }; enum suspend_opts { - SUSPEND_STEP, /* Suspend a job step now */ - RESUME_STEP /* Resume a job step now */ + SUSPEND_JOB, /* Suspend a job now */ + RESUME_JOB /* Resume a job now */ }; /* SLURM Message types */ @@ -466,7 +466,6 @@ typedef struct checkpoint_resp_msg { typedef struct suspend_msg { uint16_t op; /* suspend operation, see enum suspend_opts */ uint32_t job_id; /* slurm job_id */ - uint32_t step_id; /* slurm step_id */ } suspend_msg_t; typedef struct jobacct_msg { diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 179fc186465..18011f0fb33 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -3226,7 +3226,6 @@ static void _pack_suspend_msg(suspend_msg_t *msg, Buf buffer) pack16 ( msg -> op, buffer ) ; pack32 ( msg -> job_id, buffer ) ; - pack32 ( msg -> step_id, buffer ) ; } static int _unpack_suspend_msg(suspend_msg_t **msg_ptr, Buf buffer) @@ -3239,7 +3238,6 @@ static int _unpack_suspend_msg(suspend_msg_t **msg_ptr, Buf buffer) safe_unpack16 ( & msg -> op , buffer ) ; safe_unpack32 ( & msg -> job_id , buffer ) ; - safe_unpack32 ( & msg -> step_id , buffer ) ; return SLURM_SUCCESS; unpack_error: diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index e879f868122..b7f11852ada 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -110,7 +110,7 @@ static void _print_ping (void); static void _print_step (char *job_step_id_str); static void _print_version( void ); static int _process_command (int argc, char *argv[]); -static int _suspend(char *op, char *job_step_id_str); +static int _suspend(char *op, char *job_id_str); static void _update_it (int argc, char *argv[]); static int _update_job (int argc, char *argv[]); static int _update_node (int argc, char *argv[]); @@ -1913,8 +1913,8 @@ scontrol [<OPTION>] [<COMMAND>] \n\ show <ENTITY> [<ID>] display state of identified entity, default \n\ is all records. \n\ shutdown shutdown slurm controller. \n\ - suspend <job[.step]> susend specified job or job step \n\ - resume <job[.step]> resume previously suspended job or job step \n\ + suspend <job_id> susend specified job \n\ + resume <job_id> resume previously suspended job \n\ update <SPECIFICATIONS> update job, node, or partition configuration \n\ verbose enable detailed logging. \n\ version display tool version number. \n\ @@ -2030,36 +2030,33 @@ static int _checkpoint(char *op, char *job_step_id_str) /* * _suspend - perform some suspend/resume operation * IN op - suspend/resume operation - * IN job_step_id_str - either a job name (for all steps of the given job) or - * a step name: "<jid>.<step_id>" + * IN job_id_str - a job id * RET 0 if no slurm error, errno otherwise. parsing error prints * error message and returns 0 */ -static int _suspend(char *op, char *job_step_id_str) +static int _suspend(char *op, char *job_id_str) { int rc = SLURM_SUCCESS; - uint32_t job_id = 0, step_id = 0; + uint32_t job_id = 0; char *next_str; - if (job_step_id_str) { - job_id = (uint32_t) strtol (job_step_id_str, &next_str, 10); - if (next_str[0] == '.') { - step_id = (uint32_t) strtol (&next_str[1], &next_str, 10); - } else - step_id = NO_VAL; + if (job_id_str) { + job_id = (uint32_t) strtol (job_id_str, &next_str, 10); if (next_str[0] != '\0') { - fprintf(stderr, "Invalid job step name\n"); + fprintf(stderr, "Invalid job id specified\n"); + exit_code = 1; return 0; } } else { - fprintf(stderr, "Invalid job step name\n"); + fprintf(stderr, "Invalid job id specified\n"); + exit_code = 1; return 0; } if (strncasecmp(op, "suspend", 3) == 0) - rc = slurm_suspend (job_id, step_id); + rc = slurm_suspend (job_id); else - rc = slurm_resume (job_id, step_id); + rc = slurm_resume (job_id); return rc; } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 8e2da4b6166..b7594a520a4 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3869,3 +3869,48 @@ extern int job_node_ready(uint32_t job_id, int *ready) return SLURM_SUCCESS; } +/* + * job__suspend - perform some suspend/resume operation + * IN sus_ptr - suspend/resume request message + * IN uid - user id of the user issuing the RPC + * IN conn_fd - file descriptor on which to send reply + * RET 0 on success, otherwise ESLURM error code + */ +extern int job_suspend(suspend_msg_t *ckpt_ptr, uid_t uid, + slurm_fd conn_fd) +{ + int rc = SLURM_SUCCESS; + struct job_record *job_ptr; + struct step_record *step_ptr; + slurm_msg_t resp_msg; + return_code_msg_t rc_msg; + + /* find the job */ + job_ptr = find_job_record (ckpt_ptr->job_id); + if (job_ptr == NULL) { + rc = ESLURM_INVALID_JOB_ID; + goto reply; + } + if ((uid != job_ptr->user_id) && (uid != 0)) { + rc = ESLURM_ACCESS_DENIED ; + goto reply; + } + if (job_ptr->job_state == JOB_PENDING) { + rc = ESLURM_JOB_PENDING; + goto reply; + } else if (job_ptr->job_state != JOB_RUNNING) { + rc = ESLURM_ALREADY_DONE; + goto reply; + } + + /* Not fully supported yet */ + rc = ESLURM_NOT_SUPPORTED; + + reply: + rc_msg.return_code = rc; + resp_msg.msg_type = RESPONSE_SLURM_RC; + resp_msg.data = &rc_msg; + (void) slurm_send_node_msg(conn_fd, &resp_msg); + return rc; +} + diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index ea4d331dcbe..4ebde8b4ec6 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1874,10 +1874,10 @@ inline static void _slurm_rpc_suspend(slurm_msg_t * msg) START_TIMER; switch (sus_ptr->op) { - case SUSPEND_STEP: + case SUSPEND_JOB: op = "suspend"; break; - case RESUME_STEP: + case RESUME_JOB: op = "resume"; break; default: @@ -1887,26 +1887,16 @@ inline static void _slurm_rpc_suspend(slurm_msg_t * msg) uid = g_slurm_auth_get_uid(msg->cred); lock_slurmctld(job_write_lock); - error_code = job_step_suspend(sus_ptr, uid, msg->conn_fd); + error_code = job_suspend(sus_ptr, uid, msg->conn_fd); unlock_slurmctld(job_write_lock); END_TIMER; if (error_code) { - if (sus_ptr->step_id == NO_VAL) - info("_slurm_rpc_suspend %s %u: %s", op, - sus_ptr->job_id, slurm_strerror(error_code)); - else - info("_slurm_rpc_suspend %s %u.%u %s", op, - sus_ptr->job_id, sus_ptr->step_id, - slurm_strerror(error_code)); + info("_slurm_rpc_suspend %s %u: %s", op, + sus_ptr->job_id, slurm_strerror(error_code)); } else { - if (sus_ptr->step_id == NO_VAL) - info("_slurm_rpc_suspend %s for %u %s", op, - sus_ptr->job_id, TIME_STR); - else - info("_slurm_rpc_suspend %s for %u.%u %s", op, - sus_ptr->job_id, sus_ptr->step_id, - TIME_STR); + info("_slurm_rpc_suspend %s for %u %s", op, + sus_ptr->job_id, TIME_STR); /* NOTE: This function provides it own locks */ schedule_job_save(); } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 61260d5667a..efdb8502668 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -738,8 +738,8 @@ extern int job_step_checkpoint_comp(checkpoint_comp_msg_t *ckpt_ptr, * IN conn_fd - file descriptor on which to send reply * RET 0 on success, otherwise ESLURM error code */ -extern int job_step_suspend(suspend_msg_t *ckpt_ptr, - uid_t uid, slurm_fd conn_fd); +extern int job_suspend(suspend_msg_t *ckpt_ptr, uid_t uid, + slurm_fd conn_fd); /* * job_complete - note the normal termination the specified job diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 1da9a30eeb9..ab68f2c30ff 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -951,48 +951,3 @@ extern int job_step_checkpoint_comp(checkpoint_comp_msg_t *ckpt_ptr, (void) slurm_send_node_msg(conn_fd, &resp_msg); return rc; } - -/* - * job_step_suspend - perform some suspend/resume operation - * IN sus_ptr - suspend/resume request message - * IN uid - user id of the user issuing the RPC - * IN conn_fd - file descriptor on which to send reply - * RET 0 on success, otherwise ESLURM error code - */ -extern int job_step_suspend(suspend_msg_t *ckpt_ptr, - uid_t uid, slurm_fd conn_fd) -{ - int rc = SLURM_SUCCESS; - struct job_record *job_ptr; - struct step_record *step_ptr; - slurm_msg_t resp_msg; - return_code_msg_t rc_msg; - - /* find the job */ - job_ptr = find_job_record (ckpt_ptr->job_id); - if (job_ptr == NULL) { - rc = ESLURM_INVALID_JOB_ID; - goto reply; - } - if ((uid != job_ptr->user_id) && (uid != 0)) { - rc = ESLURM_ACCESS_DENIED ; - goto reply; - } - if (job_ptr->job_state == JOB_PENDING) { - rc = ESLURM_JOB_PENDING; - goto reply; - } else if (job_ptr->job_state != JOB_RUNNING) { - rc = ESLURM_ALREADY_DONE; - goto reply; - } - - /* Not fully supported yet */ - rc = ESLURM_NOT_SUPPORTED; - - reply: - rc_msg.return_code = rc; - resp_msg.msg_type = RESPONSE_SLURM_RC; - resp_msg.data = &rc_msg; - (void) slurm_send_node_msg(conn_fd, &resp_msg); - return rc; -} -- GitLab