From 4d43c7f89255c33c40da3bf9c1fd6413c1d3549e Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Tue, 15 Jul 2014 15:50:37 -0700 Subject: [PATCH] Refactor job suspend/resume logic For improved support of job arrays --- src/slurmctld/job_mgr.c | 136 +++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 65 deletions(-) diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 136052e8371..0436f978026 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -11109,95 +11109,58 @@ static int _job_resume_test(struct job_record *job_ptr) } /* - * job_suspend - perform some suspend/resume operation - * IN sus_ptr - suspend/resume request message - * IN uid - user id of the user issuing the RPC - * IN conn_fd - file descriptor on which to send reply, - * -1 if none + * _job_suspend - perform some suspend/resume operation + * job_ptr - job to operate upon + * op IN - operation: suspend/resume * indf_susp IN - set if job is being suspended indefinitely by user or admin * and we should clear it's priority, otherwise suspended * temporarily for gang scheduling - * IN protocol_version - slurm protocol version of client * RET 0 on success, otherwise ESLURM error code */ -extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, - slurm_fd_t conn_fd, bool indf_susp, - uint16_t protocol_version) +static int _job_suspend(struct job_record *job_ptr, uint16_t op, bool indf_susp) { int rc = SLURM_SUCCESS; time_t now = time(NULL); - struct job_record *job_ptr = NULL; - slurm_msg_t resp_msg; - return_code_msg_t rc_msg; - -#ifdef HAVE_BG - rc = ESLURM_NOT_SUPPORTED; - goto reply; -#endif - - /* find the job */ - job_ptr = find_job_record (sus_ptr->job_id); - if (job_ptr == NULL) { - rc = ESLURM_INVALID_JOB_ID; - goto reply; - } - /* validate the request */ - if ((uid != 0) && (uid != getuid())) { - rc = ESLURM_ACCESS_DENIED; - goto reply; - } - if (IS_JOB_PENDING(job_ptr)) { - rc = ESLURM_JOB_PENDING; - goto reply; - } - if 
(IS_JOB_FINISHED(job_ptr)) { - rc = ESLURM_ALREADY_DONE; - goto reply; - } - if ((sus_ptr->op == SUSPEND_JOB) && - (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS)) { - rc = ESLURM_NOT_SUPPORTED; - goto reply; - } - if ((sus_ptr->op == RESUME_JOB) && (rc = _job_resume_test(job_ptr))) - goto reply; + if (IS_JOB_PENDING(job_ptr)) + return ESLURM_JOB_PENDING; + if (IS_JOB_FINISHED(job_ptr)) + return ESLURM_ALREADY_DONE; + if ((op == SUSPEND_JOB) && + (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS)) + return ESLURM_NOT_SUPPORTED; + if ((op == RESUME_JOB) && (rc = _job_resume_test(job_ptr))) + return rc; /* Notify salloc/srun of suspend/resume */ - srun_job_suspend(job_ptr, sus_ptr->op); + srun_job_suspend(job_ptr, op); /* perform the operation */ - if (sus_ptr->op == SUSPEND_JOB) { - if (!IS_JOB_RUNNING(job_ptr)) { - rc = ESLURM_JOB_NOT_RUNNING; - goto reply; - } + if (op == SUSPEND_JOB) { + if (!IS_JOB_RUNNING(job_ptr)) + return ESLURM_JOB_NOT_RUNNING; rc = _suspend_job_nodes(job_ptr, indf_susp); if (rc != SLURM_SUCCESS) - goto reply; - _suspend_job(job_ptr, sus_ptr->op, indf_susp); + return rc; + _suspend_job(job_ptr, op, indf_susp); job_ptr->job_state = JOB_SUSPENDED; if (indf_susp) job_ptr->priority = 0; if (job_ptr->suspend_time) { job_ptr->pre_sus_time += - difftime(now, - job_ptr->suspend_time); + difftime(now, job_ptr->suspend_time); } else { job_ptr->pre_sus_time += - difftime(now, - job_ptr->start_time); + difftime(now, job_ptr->start_time); } suspend_job_step(job_ptr); - } else if (sus_ptr->op == RESUME_JOB) { - if (!IS_JOB_SUSPENDED(job_ptr)) { - rc = ESLURM_JOB_NOT_SUSPENDED; - goto reply; - } + } else if (op == RESUME_JOB) { + if (!IS_JOB_SUSPENDED(job_ptr)) + return ESLURM_JOB_NOT_SUSPENDED; rc = _resume_job_nodes(job_ptr, indf_susp); if (rc != SLURM_SUCCESS) - goto reply; - _suspend_job(job_ptr, sus_ptr->op, indf_susp); + return rc; + _suspend_job(job_ptr, op, indf_susp); if (job_ptr->priority == 0) set_job_prio(job_ptr); job_ptr->job_state = 
JOB_RUNNING; @@ -11218,8 +11181,7 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, (!job_ptr->preempt_time)) { debug3("Job %u resumed, updating end_time", job_ptr->job_id); - job_ptr->end_time = now + - (job_ptr->time_limit * 60) + job_ptr->end_time = now + (job_ptr->time_limit * 60) - job_ptr->pre_sus_time; } resume_job_step(job_ptr); @@ -11229,6 +11191,50 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, job_ptr->suspend_time = now; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); + return rc; +} + +/* + * job_suspend - perform some suspend/resume operation + * IN sus_ptr - suspend/resume request message + * IN uid - user id of the user issuing the RPC + * IN conn_fd - file descriptor on which to send reply, + * -1 if none + * indf_susp IN - set if job is being suspended indefinitely by user or admin + * and we should clear its priority, otherwise suspended + * temporarily for gang scheduling + * IN protocol_version - slurm protocol version of client + * RET 0 on success, otherwise ESLURM error code + */ +extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, + slurm_fd_t conn_fd, bool indf_susp, + uint16_t protocol_version) +{ + int rc = SLURM_SUCCESS; + struct job_record *job_ptr = NULL; + slurm_msg_t resp_msg; + return_code_msg_t rc_msg; + +#ifdef HAVE_BG + rc = ESLURM_NOT_SUPPORTED; + goto reply; +#endif + + /* find the job */ + job_ptr = find_job_record (sus_ptr->job_id); + if (job_ptr == NULL) { + rc = ESLURM_INVALID_JOB_ID; + goto reply; + } + + /* validate the request */ + if ((uid != 0) && (uid != getuid())) { + rc = ESLURM_ACCESS_DENIED; + goto reply; + } + + rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp); + reply: if (conn_fd >= 0) { slurm_msg_t_init(&resp_msg); -- GitLab