From 52564c693588f27bf210451e13d5541650cd572d Mon Sep 17 00:00:00 2001 From: David Bigagli <david@schedmd.com> Date: Fri, 11 Oct 2013 14:12:24 -0700 Subject: [PATCH] Implemented requeue of finished jobs, requeuehold option and JOB_SPECIAL_EXIT state. --- RELEASE_NOTES | 5 ++ doc/man/man1/scontrol.1 | 14 +++- doc/man/man5/slurm.conf.5 | 13 +++- slurm/slurm.h.in | 10 ++- src/api/suspend.c | 8 ++- src/common/slurm_protocol_defs.c | 10 +++ src/common/slurm_protocol_defs.h | 2 +- src/common/slurm_protocol_pack.c | 71 ++++++++++++++++++-- src/scontrol/scontrol.c | 27 +++++++- src/scontrol/scontrol.h | 3 +- src/scontrol/update_job.c | 107 ++++++++++++++++++++++++++---- src/slurmctld/job_mgr.c | 110 ++++++++++++++++++++++++++----- src/slurmctld/job_scheduler.c | 14 ++++ src/slurmctld/node_mgr.c | 2 + src/slurmctld/node_scheduler.c | 5 +- src/slurmctld/proc_req.c | 60 +++++++++++++---- src/slurmctld/slurmctld.h | 15 ++++- src/slurmd/slurmstepd/mgr.c | 5 +- src/sview/job_info.c | 2 +- 19 files changed, 419 insertions(+), 64 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 43d1ac3bb91..b67250aba21 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -79,6 +79,11 @@ COMMAND CHANGES (see man pages for details) -- Add stdin/out/err to sview job output. -- Added a new option to the scontrol command to view licenses that are configured in use and avalable. 'scontrol show licenses'. + -- Permit jobs in finished state to be requeued. + -- Added a new option to scontrol to put a requeued job on hold. A requeued + job can be put in a new special state called SPECIAL_EXIT indicating + the job has exited with a special value. + "scontrol requeuehold state=SpecialExit 123". OTHER CHANGES ============= diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 428c344fa61..f34f767d83c 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -265,9 +265,21 @@ Release a previously held job to begin execution. Also see \fBhold\fR. 
.TP \fBrequeue\fP \fIjob_id\fP -Requeue a running or pending SLURM batch job. +Requeue a running, suspended or finished SLURM batch job into pending state. .TP +\fBrequeuehold\fP \fIjob_id\fP +Requeue a running, suspended or finished SLURM batch job into pending state, +moreover the job is put in held state (priority zero). A held job can be +released using scontrol to reset its priority (e.g. "scontrol release <job_id>"). +.TP 20 +\fIState=SpecialExit\fP +The "SpecialExit" keyword specifies that the job has to be put in a +special state \fBJOB_SPECIAL_EXIT\fP. The "scontrol show job" command +will display the JobState as \fBSPECIAL_EXIT\fP, while the "squeue" +command as \fBSE\fP. +.TP + \fBresume\fP \fIjob_id\fP Resume a previously suspended job. Also see \fBsuspend\fR. diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index bba777c89b0..a6cb502c03a 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -585,7 +585,7 @@ Fully qualified pathname of a program for the slurmctld to execute upon termination of a job allocation (e.g. "/usr/local/slurm/epilog_controller"). The program executes as SlurmUser, which gives it permission to drain -nodes and requeue the job if a failure occurs or cancel the job if appropriate. +nodes and requeue the job if a failure occurs (See scontrol(1)). Exactly what the program does and how it accomplishes this is completely at the discretion of the system administrator. Information about the job being initiated, it's allocated nodes, etc. are @@ -3564,9 +3564,18 @@ The highest exit code of all of the job steps. Available in \fBEpilogSlurmctld\fR only. .TP \fBSLURM_JOB_EXIT_CODE\fR -The exit code of the job script (or salloc). +The exit code of the job script (or salloc). The value is the status +as returned by the wait() system call (See wait(2)). +Available in \fBEpilogSlurmctld\fR only. +.TP +\fBSLURM_JOB_EXIT_CODE2\fR +The exit code of the job script (or salloc). 
The value has the format +<exit>:<sig>. The first number is the exit code, typically as set by the +exit() function. The second number is the signal that caused the process to +terminate if it was terminated by a signal. Available in \fBEpilogSlurmctld\fR only. .TP + \fBSLURM_JOB_GID\fR Group ID of the job's owner. Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 7d146d5ced1..2f4d364f66a 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -273,6 +273,9 @@ enum job_states { #define JOB_RESIZING 0x2000 /* Size of job about to change, flag set * before calling accounting functions * immediately before job changes size */ +#define JOB_SPECIAL_EXIT 0x1000 /* Requeue an exit job in hold */ +#define JOB_REQUEUE_HOLD 0x800 /* Requeue any job in hold */ +#define JOB_REQUEUE 0x400 /* Requeue job in completing state */ #define READY_JOB_FATAL -2 /* fatal error */ #define READY_JOB_ERROR -1 /* ordinary error */ @@ -1372,6 +1375,11 @@ typedef struct suspend_msg { uint32_t job_id; /* slurm job_id */ } suspend_msg_t; +typedef struct requeue_msg { + uint32_t job_id; /* slurm job_id */ + uint32_t state; /* JobExitRequeue | Hold */ +} requeue_msg_t; + typedef struct { uint16_t ckpt_interval; /* checkpoint interval in minutes */ uint32_t cpu_count; /* number of required processors */ @@ -3683,7 +3691,7 @@ extern int slurm_resume PARAMS((uint32_t job_id)); * IN job_id - job on which to perform operation * RET 0 or a slurm error code */ -extern int slurm_requeue PARAMS((uint32_t job_id)); +extern int slurm_requeue PARAMS((uint32_t job_id, uint32_t state)); /*****************************************************************************\ * SLURM JOB CHECKPOINT FUNCTIONS diff --git a/src/api/suspend.c b/src/api/suspend.c index 342eecc3c8c..3a42224e0b3 100644 --- a/src/api/suspend.c +++ b/src/api/suspend.c @@ -97,14 +97,17 @@ extern int slurm_resume (uint32_t job_id) * IN job_id - job on which to perform operation 
* RET 0 or a slurm error code */ -extern int slurm_requeue (uint32_t job_id) +extern int slurm_requeue(uint32_t job_id, + uint32_t state) { int rc; - job_id_msg_t requeue_req; + requeue_msg_t requeue_req; slurm_msg_t req_msg; slurm_msg_t_init(&req_msg); + requeue_req.job_id = job_id; + requeue_req.state = state; req_msg.msg_type = REQUEST_JOB_REQUEUE; req_msg.data = &requeue_req; @@ -114,4 +117,3 @@ extern int slurm_requeue (uint32_t job_id) slurm_seterrno(rc); return rc; } - diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 7ed76261d70..37948fe98ec 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -910,6 +910,12 @@ extern void slurm_free_suspend_msg(suspend_msg_t *msg) xfree(msg); } +extern void +slurm_free_requeue_msg(requeue_msg_t *msg) +{ + xfree(msg); +} + extern void slurm_free_suspend_int_msg(suspend_int_msg_t *msg) { if (msg) { @@ -1196,6 +1202,8 @@ extern char *job_state_string(uint16_t inx) return "CONFIGURING"; if (inx & JOB_RESIZING) return "RESIZING"; + if (inx & JOB_SPECIAL_EXIT) + return "SPECIAL_EXIT"; /* Process JOB_STATE_BASE */ switch (inx & JOB_STATE_BASE) { @@ -1233,6 +1241,8 @@ extern char *job_state_string_compact(uint16_t inx) return "CF"; if (inx & JOB_RESIZING) return "RS"; + if (inx & JOB_SPECIAL_EXIT) + return "SE"; /* Process JOB_STATE_BASE */ switch (inx & JOB_STATE_BASE) { diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 9ee3868c7e3..fed4cac32d6 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -1198,7 +1198,7 @@ extern void slurm_free_job_notify_msg(job_notify_msg_t * msg); extern void slurm_free_accounting_update_msg(accounting_update_msg_t *msg); extern void slurm_free_spank_env_request_msg(spank_env_request_msg_t *msg); extern void slurm_free_spank_env_responce_msg(spank_env_responce_msg_t *msg); - +extern void slurm_free_requeue_msg(requeue_msg_t *); extern int 
slurm_free_msg_data(slurm_msg_type_t type, void *data); extern void slurm_free_license_info_request_msg(license_info_request_msg_t *msg); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 4db0c5ed1fc..5180d666ff9 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -655,6 +655,15 @@ static inline void _pack_license_info_msg(slurm_msg_t *msg, Buf buffer); static int _unpack_license_info_msg(license_info_msg_t **msg, Buf buffer, uint16_t protocol_version); +static void +_pack_job_requeue_msg(requeue_msg_t *msg, + Buf buf, + uint16_t protocol_version); +static int +_unpack_job_requeue_msg(requeue_msg_t **msg, + Buf buf, + uint16_t protocol_version); + /* pack_header * packs a slurm protocol header that precedes every slurm message * IN header - the header structure to pack @@ -1133,12 +1142,17 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) break; case REQUEST_JOB_READY: - case REQUEST_JOB_REQUEUE: case REQUEST_JOB_INFO_SINGLE: _pack_job_ready_msg((job_id_msg_t *)msg->data, buffer, msg->protocol_version); break; + case REQUEST_JOB_REQUEUE: + _pack_job_requeue_msg((requeue_msg_t *)msg->data, + buffer, + msg->protocol_version); + break; + case REQUEST_JOB_USER_INFO: _pack_job_user_msg((job_user_id_msg_t *)msg->data, buffer, msg->protocol_version); @@ -1731,11 +1745,16 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) break; case REQUEST_JOB_READY: - case REQUEST_JOB_REQUEUE: case REQUEST_JOB_INFO_SINGLE: rc = _unpack_job_ready_msg((job_id_msg_t **) - & msg->data, buffer, - msg->protocol_version); + & msg->data, buffer, + msg->protocol_version); + break; + + case REQUEST_JOB_REQUEUE: + rc = _unpack_job_requeue_msg((requeue_msg_t **)&msg->data, + buffer, + msg->protocol_version); break; case REQUEST_JOB_USER_INFO: @@ -9597,6 +9616,50 @@ unpack_error: return SLURM_ERROR; } +static void +_pack_job_requeue_msg(requeue_msg_t *msg, Buf buf, uint16_t protocol_version) +{ + uint16_t cc; + + xassert(msg != NULL); 
+ cc = 0; + + if (protocol_version >= SLURM_13_12_PROTOCOL_VERSION) { + pack32(msg->job_id, buf); + pack32(msg->state, buf); + } else { + /* For backward compatibility we emulate _pack_job_ready_msg() + */ + pack32(msg->job_id, buf); + pack16(cc, buf); + } +} + +static int +_unpack_job_requeue_msg(requeue_msg_t **msg, Buf buf, uint16_t protocol_version) +{ + uint16_t cc; + + *msg = xmalloc(sizeof(requeue_msg_t)); + + if (protocol_version >= SLURM_13_12_PROTOCOL_VERSION) { + safe_unpack32(&(*msg)->job_id, buf); + safe_unpack32(&(*msg)->state, buf); + } else { + /* Translate job_id_msg_t into requeue_msg_t + */ + safe_unpack32(&(*msg)->job_id, buf) ; + safe_unpack16(&cc, buf); + (*msg)->state = cc; + } + + return SLURM_SUCCESS; +unpack_error: + slurm_free_requeue_msg(*msg); + *msg = NULL; + return SLURM_ERROR; +} + static void _pack_job_user_msg(job_user_id_msg_t * msg, Buf buffer, uint16_t protocol_version) diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index be6b2638d64..350254a0b93 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -847,7 +847,30 @@ _process_command (int argc, char *argv[]) } } else if (strncasecmp (tag, "requeue", MAX(tag_len, 3)) == 0) { - if (argc > 2) { + if (argc > 3) { + exit_code = 1; + if (quiet_flag != 1) + fprintf(stderr, + "too many arguments for keyword:%s\n", + tag); + } else if (argc < 2) { + exit_code = 1; + if (quiet_flag != 1) + fprintf(stderr, + "too few arguments for keyword:%s\n", + tag); + } else { + error_code = scontrol_requeue((argc - 1), &argv[1]); + if (error_code) { + exit_code = 1; + if (quiet_flag != 1) + slurm_perror ("slurm_requeue error"); + } + } + + } + else if (strncasecmp(tag, "requeuehold", 11) == 0) { + if (argc > 3) { exit_code = 1; if (quiet_flag != 1) fprintf(stderr, @@ -860,7 +883,7 @@ _process_command (int argc, char *argv[]) "too few arguments for keyword:%s\n", tag); } else { - error_code = scontrol_requeue(argv[1]); + error_code = scontrol_requeue_hold((argc - 1), 
&argv[1]); if (error_code) { exit_code = 1; if (quiet_flag != 1) diff --git a/src/scontrol/scontrol.h b/src/scontrol/scontrol.h index d8eeb580fd6..87c3002b0c3 100644 --- a/src/scontrol/scontrol.h +++ b/src/scontrol/scontrol.h @@ -152,7 +152,8 @@ extern void scontrol_print_block (char *block_name); extern void scontrol_print_res (char *reservation_name); extern void scontrol_print_step (char *job_step_id_str); extern void scontrol_print_topo (char *node_list); -extern int scontrol_requeue(char *job_step_id_str); +extern int scontrol_requeue(int argc, char **argv); +extern int scontrol_requeue_hold(int argc, char **argv); extern int scontrol_suspend(char *op, char *job_id_str); extern int scontrol_update_front_end (int argc, char *argv[]); extern int scontrol_update_job (int argc, char *argv[]); diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 9780aa2be98..6231acad5f4 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -46,6 +46,7 @@ static int _parse_checkpoint_args(int argc, char **argv, static int _parse_restart_args(int argc, char **argv, uint16_t *stick, char **image_dir); static void _update_job_size(uint32_t job_id); +static int _parse_requeue_flags(char *, uint32_t *state_flags); /* * scontrol_checkpoint - perform some checkpoint/resume operation @@ -338,26 +339,67 @@ scontrol_suspend(char *op, char *job_id_str) * error message and returns 0 */ extern int -scontrol_requeue(char *job_id_str) +scontrol_requeue(int argc, char **argv) { int rc = SLURM_SUCCESS; uint32_t job_id = 0; char *next_str; - if (job_id_str) { - job_id = (uint32_t) strtol (job_id_str, &next_str, 10); - if (next_str[0] != '\0') { - fprintf(stderr, "Invalid job id specified\n"); - exit_code = 1; - return 0; - } - } else { + if (! 
argv[0]) { + exit_code = 1; + return 0; + } + + job_id = (uint32_t)strtol(argv[0], &next_str, 10); + if (next_str[0] != '\0') { + fprintf(stderr, "Invalid job id specified\n"); + exit_code = 1; + return 0; + } + + rc = slurm_requeue(job_id, 0); + + return rc; +} + +extern int +scontrol_requeue_hold(int argc, char **argv) +{ + int rc = SLURM_SUCCESS; + uint32_t job_id = 0; + char *next_str; + char *job_id_str; + uint32_t state_flag; + + state_flag = 0; + + if (argc == 1) + job_id_str = argv[0]; + else + job_id_str = argv[1]; + + job_id = (uint32_t)strtol(job_id_str, &next_str, 10); + if (next_str[0] != '\0') { fprintf(stderr, "Invalid job id specified\n"); exit_code = 1; return 0; } - rc = slurm_requeue (job_id); + if (argc == 2) { + rc = _parse_requeue_flags(argv[0], &state_flag); + if (rc < 0) { + error("Invalid state specification %s", argv[0]); + exit_code = 1; + return 0; + } + } + state_flag |= JOB_REQUEUE_HOLD; + + /* Go and requeue the state either in + * JOB_SPECIAL_EXIT or HELD state. 
+ */ + rc = slurm_requeue(job_id, state_flag); + return rc; } @@ -856,12 +898,12 @@ static void _update_job_size(uint32_t job_id) (void) unlink(fname_csh); (void) unlink(fname_sh); if (!(resize_csh = fopen(fname_csh, "w"))) { - fprintf(stderr, "Could not create file %s: %s\n", fname_csh, + fprintf(stderr, "Could not create file %s: %s\n", fname_csh, strerror(errno)); goto fini; } if (!(resize_sh = fopen(fname_sh, "w"))) { - fprintf(stderr, "Could not create file %s: %s\n", fname_sh, + fprintf(stderr, "Could not create file %s: %s\n", fname_sh, strerror(errno)); goto fini; } @@ -875,9 +917,9 @@ static void _update_job_size(uint32_t job_id) alloc_info->node_list); } if (getenv("SLURM_JOB_NODELIST")) { - fprintf(resize_sh, "export SLURM_JOB_NODELIST=\"%s\"\n", + fprintf(resize_sh, "export SLURM_JOB_NODELIST=\"%s\"\n", alloc_info->node_list); - fprintf(resize_csh, "setenv SLURM_JOB_NODELIST \"%s\"\n", + fprintf(resize_csh, "setenv SLURM_JOB_NODELIST \"%s\"\n", alloc_info->node_list); } if (getenv("SLURM_NNODES")) { @@ -921,3 +963,40 @@ fini: slurm_free_resource_allocation_response_msg(alloc_info); if (resize_sh) fclose(resize_sh); } + +/* _parse_requeue_args() + */ +static int +_parse_requeue_flags(char *s, uint32_t *state) +{ + char *p; + char *p0; + char *z; + + p0 = p = xstrdup(s); + /* search for = + */ + z = strchr(p, '='); + if (!z) { + return -1; + } + *z = 0; + + /* validate flags keyword + */ + if (strncasecmp(p, "state", 5) != 0) { + return -1; + } + ++z; + + p = z; + if (strncasecmp(p, "specialexit", 11) == 0 + || strncasecmp(p, "se", 2) == 0) { + *state = JOB_SPECIAL_EXIT; + xfree(p0); + return 0; + } + + xfree(p0); + return -1; +} diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 58812b0dabb..4f6d586e2bf 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2198,6 +2198,7 @@ extern int kill_job_by_front_end_name(char *node_name) } if (job_ptr->node_cnt == 0) { job_ptr->job_state &= (~JOB_COMPLETING); + 
job_hold_requeue(job_ptr); delete_step_records(job_ptr); slurm_sched_g_schedule(); } @@ -2423,6 +2424,7 @@ extern int kill_running_job_by_node_name(char *node_name) } if (job_ptr->node_cnt == 0) { job_ptr->job_state &= (~JOB_COMPLETING); + job_hold_requeue(job_ptr); delete_step_records(job_ptr); slurm_sched_g_schedule(); } @@ -3478,7 +3480,7 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, uint32_t job_comp_flag = 0; bool suspended = false; - info("completing job %u", job_id); + info("completing job %u status %d", job_id, job_return_code); job_ptr = find_job_record(job_id); if (job_ptr == NULL) { info("job_complete: invalid JobId=%u", job_id); @@ -8079,6 +8081,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) info("sched: update_job: releasing user hold " "for job_id %u", job_specs->job_id); job_ptr->state_reason = WAIT_NO_REASON; + job_ptr->job_state &= ~JOB_SPECIAL_EXIT; xfree(job_ptr->state_desc); } else if (authorized || (job_ptr->priority > job_specs->priority)) { @@ -8106,6 +8109,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) } else if ((job_ptr->state_reason == WAIT_HELD) || (job_ptr->state_reason == WAIT_HELD_USER)) { job_ptr->state_reason = WAIT_NO_REASON; + job_ptr->job_state &= ~JOB_SPECIAL_EXIT; xfree(job_ptr->state_desc); } } else { @@ -10232,8 +10236,11 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ -extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd, - uint16_t protocol_version, bool preempt) +extern int job_requeue(uid_t uid, + uint32_t job_id, + slurm_fd_t conn_fd, + uint16_t protocol_version, + bool preempt) { int rc = SLURM_SUCCESS; struct job_record *job_ptr = NULL; @@ -10241,9 +10248,10 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd, slurm_msg_t resp_msg; return_code_msg_t rc_msg; time_t now = time(NULL); + bool is_running; /* find the job */ - job_ptr 
= find_job_record (job_id); + job_ptr = find_job_record(job_id); if (job_ptr == NULL) { rc = ESLURM_INVALID_JOB_ID; goto reply; @@ -10256,14 +10264,12 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd, rc = ESLURM_ACCESS_DENIED; goto reply; } - if (IS_JOB_FINISHED(job_ptr)) { - rc = ESLURM_ALREADY_DONE; - goto reply; - } + if ((job_ptr->details == NULL) || (job_ptr->details->requeue == 0)) { rc = ESLURM_DISABLED; goto reply; } + if (IS_JOB_COMPLETING(job_ptr)) { if (IS_JOB_PENDING(job_ptr)) goto reply; /* already requeued */ @@ -10281,13 +10287,6 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd, goto reply; } - if (!IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_RUNNING(job_ptr)) { - error("job_requeue job %u state is bad %s", job_id, - job_state_string(job_ptr->job_state)); - rc = EINVAL; - goto reply; - } - slurm_sched_g_requeue(job_ptr, "Job requeued by user/admin"); last_job_update = now; @@ -10308,13 +10307,28 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd, else job_ptr->end_time = now; + /* Save the state of the job so that + * we deallocate the nodes if is in + * running state. + */ + is_running = false; + if (IS_JOB_SUSPENDED(job_ptr) + || IS_JOB_RUNNING(job_ptr)) + is_running = true; + /* We want this job to look like it was cancelled in the * accounting logs. Set a new submit time so the restarted * job looks like a new job. */ job_ptr->job_state = JOB_CANCELLED; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, true); - deallocate_nodes(job_ptr, false, suspended, preempt); + + /* Deallocate resources only if the job + * has some. + */ + if (is_running) + deallocate_nodes(job_ptr, false, suspended, preempt); + xfree(job_ptr->details->req_node_layout); /* do this after the epilog complete, setting it here is too early */ @@ -10334,7 +10348,7 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd, * to add it again. 
*/ acct_policy_add_job_submit(job_ptr); - reply: +reply: if (conn_fd >= 0) { slurm_msg_t_init(&resp_msg); resp_msg.protocol_version = protocol_version; @@ -11285,9 +11299,71 @@ extern void build_cg_bitmap(struct job_record *job_ptr) job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap); if (bit_set_count(job_ptr->node_bitmap_cg) == 0) job_ptr->job_state &= (~JOB_COMPLETING); + info("%s: JOB_COMPLETING cleaned state 0x%x", __func__, job_ptr->job_state); } else { error("build_cg_bitmap: node_bitmap is NULL"); job_ptr->node_bitmap_cg = bit_alloc(node_record_count); job_ptr->job_state &= (~JOB_COMPLETING); } + job_hold_requeue(job_ptr); +} + +/* job_hold_requeue() + * + * Requeue the job either in JOB_SPECIAL_EXIT state + * in which is put on hold or if JOB_REQUEUE_HOLD is + * specified don't change its state. The requeue + * can happen directly from job_requeue() or from + * job_epilog_complete() after the last component + * has finished. + */ +void +job_hold_requeue(struct job_record *job_ptr) +{ + uint32_t state; + + xassert(job_ptr); + + state = job_ptr->job_state; + + if (! (state & JOB_SPECIAL_EXIT) + && ! (state & JOB_REQUEUE_HOLD) + && ! (state & JOB_REQUEUE)) + return; + + debug("%s: job %u state 0x%x", __func__, job_ptr->job_id, state); + + /* We have to set the state here in case + * we are not requeueing the job from + * job_requeue() but from job_epilog_complete(). + */ + job_ptr->job_state = JOB_PENDING; + + /* Test if user wants to requeue the job + * in hold or with a special exit value. + */ + if (state & JOB_SPECIAL_EXIT) { + /* JOB_SPECIAL_EXIT means requeue the + * the job, put it on hold and display + * it as JOB_SPECIAL_EXIT. 
+ */ + job_ptr->job_state |= JOB_SPECIAL_EXIT; + job_ptr->state_reason = WAIT_HELD_USER; + job_ptr->priority = 0; + } + + if (state & JOB_REQUEUE_HOLD) { + /* The job will be requeued in status + * PENDING and held + */ + job_ptr->state_reason = WAIT_HELD_USER; + job_ptr->priority = 0; + } + + job_ptr->job_state &= ~JOB_REQUEUE_HOLD; + job_ptr->job_state &= ~JOB_REQUEUE; + + debug("%s: job %u state 0x%x reason %u priority %d", __func__, + job_ptr->job_id, job_ptr->job_state, + job_ptr->state_reason, job_ptr->priority); } diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index b906db032fe..8430d0d0575 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -2010,6 +2010,9 @@ extern int epilog_slurmctld(struct job_record *job_ptr) static char **_build_env(struct job_record *job_ptr) { char **my_env, *name; + char buf[32]; + int exit_code; + int signal; my_env = xmalloc(sizeof(char *)); my_env[0] = NULL; @@ -2052,6 +2055,17 @@ static char **_build_env(struct job_record *job_ptr) } setenvf(&my_env, "SLURM_JOB_DERIVED_EC", "%u", job_ptr->derived_ec); + + exit_code = signal = 0; + if (WIFEXITED(job_ptr->exit_code)) { + exit_code = WEXITSTATUS(job_ptr->exit_code); + } + if (WIFSIGNALED(job_ptr->exit_code)) { + signal = WTERMSIG(job_ptr->exit_code); + } + sprintf(buf, "%d:%d", exit_code, signal); + setenvf(&my_env, "SLURM_JOB_EXIT_CODE2", "%s", buf); + setenvf(&my_env, "SLURM_JOB_EXIT_CODE", "%u", job_ptr->exit_code); setenvf(&my_env, "SLURM_JOB_GID", "%u", job_ptr->group_id); name = gid_to_string((uid_t) job_ptr->group_id); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 8f31083f6cf..711b82940e2 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -2891,6 +2891,8 @@ void make_node_idle(struct node_record *node_ptr, "%ld seconds", job_ptr->job_id, (long) delay); job_ptr->job_state &= (~JOB_COMPLETING); + job_hold_requeue(job_ptr); + delete_step_records(job_ptr); 
slurm_sched_g_schedule(); } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 32a14aaa9be..d36a70c193e 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -511,6 +511,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, if ((agent_args->node_count - down_node_cnt) == 0) { job_ptr->job_state &= (~JOB_COMPLETING); + job_hold_requeue(job_ptr); delete_step_records(job_ptr); slurm_sched_g_schedule(); } @@ -1369,7 +1370,7 @@ static void _preempt_jobs(List preemptee_job_list, bool kill_pending, if (!kill_pending) continue; rc = job_requeue(0, job_ptr->job_id, -1, - (uint16_t)NO_VAL, true); + (uint16_t)NO_VAL, true); if (rc == SLURM_SUCCESS) { info("preempted job %u has been requeued", job_ptr->job_id); @@ -2513,6 +2514,7 @@ extern void re_kill_job(struct job_record *job_ptr) ((--job_ptr->node_cnt) == 0)) { last_node_update = time(NULL); job_ptr->job_state &= (~JOB_COMPLETING); + job_hold_requeue(job_ptr); delete_step_records(job_ptr); slurm_sched_g_schedule(); } @@ -2540,6 +2542,7 @@ extern void re_kill_job(struct job_record *job_ptr) if ((job_ptr->node_cnt > 0) && ((--job_ptr->node_cnt) == 0)) { job_ptr->job_state &= (~JOB_COMPLETING); + job_hold_requeue(job_ptr); delete_step_records(job_ptr); slurm_sched_g_schedule(); last_node_update = time(NULL); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 8135c2433a7..ed4f632fa73 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -3642,30 +3642,68 @@ inline static void _slurm_rpc_requeue(slurm_msg_t * msg) { int error_code = SLURM_SUCCESS; DEF_TIMERS; - job_id_msg_t *requeue_ptr = (job_id_msg_t *) msg->data; + requeue_msg_t *req_ptr = (requeue_msg_t *)msg->data; /* Locks: write job and node */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL); + struct job_record *job_ptr; START_TIMER; - info("Processing RPC: 
REQUEST_REQUEUE from uid=%d", uid); + info("%s: Processing RPC: REQUEST_REQUEUE from uid=%d", __func__, uid); + + job_ptr = find_job_record(req_ptr->job_id); + if (job_ptr == NULL) { + info("%s: %u: %s", __func__, req_ptr->job_id, + slurm_strerror(ESLURM_INVALID_JOB_ID)); + return; + } lock_slurmctld(job_write_lock); - error_code = job_requeue(uid, requeue_ptr->job_id, - msg->conn_fd, msg->protocol_version, false); + error_code = job_requeue(uid, + req_ptr->job_id, + msg->conn_fd, + msg->protocol_version, + false); unlock_slurmctld(job_write_lock); END_TIMER2("_slurm_rpc_requeue"); if (error_code) { - info("_slurm_rpc_requeue %u: %s", requeue_ptr->job_id, - slurm_strerror(error_code)); - } else { - info("_slurm_rpc_requeue %u: %s", requeue_ptr->job_id, - TIME_STR); - /* Functions below provide their own locking */ - schedule_job_save(); + + if (error_code == ESLURM_TRANSITION_STATE_NO_UPDATE) { + /* The job is in state JOB_COMPLETING save the + * requested operation and carry on. The requeue + * will be done after the last job epilog completes. + */ + if (req_ptr->state & JOB_SPECIAL_EXIT) + job_ptr->job_state |= JOB_SPECIAL_EXIT; + if (req_ptr->state & JOB_REQUEUE_HOLD) + job_ptr->job_state |= JOB_REQUEUE_HOLD; + job_ptr->job_state |= JOB_REQUEUE; + + } else { + info("%s: %u: %s", __func__, req_ptr->job_id, + slurm_strerror(error_code)); + } + + return; } + + /* Requeue operation went all right, see if the user + * wants to mark the job as special case or hold it. 
+ */ + if (req_ptr->state & JOB_SPECIAL_EXIT) + job_ptr->job_state |= JOB_SPECIAL_EXIT; + if (req_ptr->state & JOB_REQUEUE_HOLD) + job_ptr->job_state |= JOB_REQUEUE_HOLD; + + job_hold_requeue(job_ptr); + + info("%s: %u: %s", __func__, req_ptr->job_id, TIME_STR); + + /* Functions below provide their own locking + */ + schedule_job_save(); } /* Assorted checkpoint operations */ diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 1d0bb989a39..92b325131ac 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1231,9 +1231,11 @@ extern int job_req_node_filter(struct job_record *job_ptr, * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ -extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd_t conn_fd, - uint16_t protocol_version, bool preempt); - +extern int job_requeue(uid_t uid, + uint32_t job_id, + slurm_fd_t conn_fd, + uint16_t protocol_version, + bool preempt); /* * job_step_complete - note normal completion the specified job step * IN job_id - id of the job to be completed @@ -1993,4 +1995,11 @@ extern bool validate_super_user(uid_t uid); */ extern bool validate_operator(uid_t uid); +/* job_hold_requeue() - requeue a job in hold or requeue_exit + * state. 
+ * + * IN - job record + */ +extern void job_hold_requeue(struct job_record *job_ptr); + #endif /* !_HAVE_SLURMCTLD_H */ diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 8b16b74059b..a691caa6bf0 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -621,7 +621,7 @@ _send_exit_msg(stepd_step_rec_t *job, uint32_t *tid, int n, int status) ListIterator i = NULL; srun_info_t *srun = NULL; - debug3("sending task exit msg for %d tasks", n); + debug3("sending task exit msg for %d tasks status %d", n, status); msg.task_id_list = tid; msg.num_tasks = n; @@ -2087,7 +2087,8 @@ _send_complete_batch_script_msg(stepd_step_rec_t *job, int err, int status) req_msg.msg_type= REQUEST_COMPLETE_BATCH_SCRIPT; req_msg.data = &req; - info("sending REQUEST_COMPLETE_BATCH_SCRIPT, error:%u", err); + info("sending REQUEST_COMPLETE_BATCH_SCRIPT, error:%u status %d", + err, status); /* Note: these log messages don't go to slurmd.log from here */ for (i = 0; i <= MAX_RETRY; i++) { diff --git a/src/sview/job_info.c b/src/sview/job_info.c index 2d42fdf7a90..1052bd7a58a 100644 --- a/src/sview/job_info.c +++ b/src/sview/job_info.c @@ -3953,7 +3953,7 @@ static void process_foreach_list (jobs_foreach_common_t *jobs_foreach_common) stepid, signal); break; case EDIT_REQUEUE: - response = slurm_requeue(jobid); + response = slurm_requeue(jobid, 0); if (response) { /* stop rest of jobs */ -- GitLab