diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index ab9b74feeca1a0a78b88396f34f0aeef86784652..f59951dbce42781979b41e11e291c2602b9413b5 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -3818,6 +3818,21 @@ extern int slurm_resume2 PARAMS((char *job_id)); */ extern int slurm_requeue PARAMS((uint32_t job_id, uint32_t state)); +/* + * slurm_requeue2 - re-queue a batch job, if already running + * then terminate it first + * IN job_id in string form - job on which to perform operation + * IN state - the state in which the job should be requeued + * valid values are: + * 0 - if the job has to be requeued in JOB_PENDING state + * JOB_SPECIAL_EXIT - if the job has to be requeued in + * the special exit state and be held. + * JOB_REQUEUE_HOLD - if the job has to be requeued in + * JOB_PENDING and held state. + * RET 0 or a slurm error code + */ +extern int slurm_requeue2 PARAMS((char *job_id, uint32_t state)); + /*****************************************************************************\ * SLURM JOB CHECKPOINT FUNCTIONS \*****************************************************************************/ diff --git a/src/api/suspend.c b/src/api/suspend.c index 847aa9ded094a7bb406e63b71d3a800dad337944..f18185e16a39efd7cdad3119dca8bbe9e88b3ff4 100644 --- a/src/api/suspend.c +++ b/src/api/suspend.c @@ -49,7 +49,6 @@ * IN op - operation to perform * IN job_id - job on which to perform operation or NO_VAL * IN job_id_str - job on which to perform operation in string format or NULL - * IN step_id - job step on which to perform operation * RET 0 or a slurm error code * NOTE: Supply either job_id NO_VAL or job_id_str as NULL, not both */ @@ -113,14 +112,16 @@ extern int slurm_resume2(char *job_id) return _suspend_op(RESUME_JOB, NO_VAL, job_id); } + /* - * slurm_requeue - re-queue a batch job, if already running - * then terminate it first - * IN job_id - job on which to perform operation + * _requeue_op - perform a requeue operation for some job. 
+ * IN state - state in which to place the job + * IN job_id - job on which to perform operation or NO_VAL + * IN job_id_str - job on which to perform operation in string format or NULL * RET 0 or a slurm error code + * NOTE: Supply either job_id NO_VAL or job_id_str as NULL, not both */ -extern int slurm_requeue(uint32_t job_id, - uint32_t state) +static int _requeue_op(uint32_t state, uint32_t job_id, char *job_id_str) { int rc; requeue_msg_t requeue_req; @@ -129,7 +130,8 @@ extern int slurm_requeue(uint32_t job_id, slurm_msg_t_init(&req_msg); requeue_req.job_id = job_id; - requeue_req.state = state; + requeue_req.job_id_str = job_id_str; + requeue_req.state = state; req_msg.msg_type = REQUEST_JOB_REQUEUE; req_msg.data = &requeue_req; @@ -139,3 +141,27 @@ extern int slurm_requeue(uint32_t job_id, slurm_seterrno(rc); return rc; } + +/* + * slurm_requeue - re-queue a batch job, if already running + * then terminate it first + * IN job_id - job on which to perform operation + * IN state - state in which to place the job + * RET 0 or a slurm error code + */ +extern int slurm_requeue(uint32_t job_id, uint32_t state) +{ + return _requeue_op(state, job_id, NULL); +} + +/* + * slurm_requeue2 - re-queue a batch job, if already running + * then terminate it first + * IN job_id_str - job on which to perform operation in string format or NULL + * IN state - state in which to place the job + * RET 0 or a slurm error code + */ +extern int slurm_requeue2(char *job_id_str, uint32_t state) +{ + return _requeue_op(state, 0, job_id_str); +} diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 6d49df48eeeceae7850c4484d58f9e6a332271bb..ad2fc4fd848cf969bbe4508e1961026362c1d62e 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -959,7 +959,10 @@ extern void slurm_free_suspend_msg(suspend_msg_t *msg) extern void slurm_free_requeue_msg(requeue_msg_t *msg) { - xfree(msg); + if (msg) { + xfree(msg->job_id_str); + 
xfree(msg); + } } extern void slurm_free_suspend_int_msg(suspend_int_msg_t *msg) diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index f3e6dfa7791add4ae55629c5e32ed4bfd7eb1fb0..b9a711704ecaf21918c40ab35d4b0e3c4019b63b 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -1047,7 +1047,8 @@ typedef struct slurm_node_registration_status_msg { } slurm_node_registration_status_msg_t; typedef struct requeue_msg { - uint32_t job_id; /* slurm job_id */ + uint32_t job_id; /* slurm job ID (number) */ + char * job_id_str; /* slurm job ID (string) */ uint32_t state; /* JobExitRequeue | Hold */ } requeue_msg_t; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 9c67bec86457ee14f10185dd76d18789a7ee13f8..9a745f1ed1e2861bd4c56bb18ac38bb81b412609 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -10307,7 +10307,11 @@ _pack_job_requeue_msg(requeue_msg_t *msg, Buf buf, uint16_t protocol_version) { xassert(msg != NULL); - if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_14_11_PROTOCOL_VERSION) { + pack32(msg->job_id, buf); + packstr(msg->job_id_str, buf); + pack32(msg->state, buf); + } else if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { pack32(msg->job_id, buf); pack32(msg->state, buf); } else { @@ -10323,9 +10327,14 @@ _pack_job_requeue_msg(requeue_msg_t *msg, Buf buf, uint16_t protocol_version) static int _unpack_job_requeue_msg(requeue_msg_t **msg, Buf buf, uint16_t protocol_version) { + uint32_t uint32_tmp = 0; *msg = xmalloc(sizeof(requeue_msg_t)); - if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { + if (protocol_version >= SLURM_14_11_PROTOCOL_VERSION) { + safe_unpack32(&(*msg)->job_id, buf); + safe_unpackstr_xmalloc(&(*msg)->job_id_str, &uint32_tmp, buf); + safe_unpack32(&(*msg)->state, buf); + } else if (protocol_version >= SLURM_14_03_PROTOCOL_VERSION) { 
safe_unpack32(&(*msg)->job_id, buf); safe_unpack32(&(*msg)->state, buf); } else { diff --git a/src/plugins/sched/wiki2/job_requeue.c b/src/plugins/sched/wiki2/job_requeue.c index 767ccf5c4e0c24a3e3acfaaaed99ced5fb0cf671..64f1a0ac4854e60f66beada963d034eb6899e52d 100644 --- a/src/plugins/sched/wiki2/job_requeue.c +++ b/src/plugins/sched/wiki2/job_requeue.c @@ -68,7 +68,7 @@ extern int job_requeue_wiki(char *cmd_ptr, int *err_code, char **err_msg) } lock_slurmctld(job_write_lock); - slurm_rc = job_requeue(0, jobid, -1, (uint16_t)NO_VAL, false); + slurm_rc = job_requeue(0, jobid, -1, (uint16_t)NO_VAL, false, 0); if (slurm_rc != SLURM_SUCCESS) { unlock_slurmctld(job_write_lock); *err_code = -700; diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c index 63c3b5ce09e129b6a35b499e49ee1eb2ec240d72..2cf59a6e6e258fd7f7bf5c9962fc2b4beddb2c0a 100644 --- a/src/plugins/select/bluegene/bg_core.c +++ b/src/plugins/select/bluegene/bg_core.c @@ -337,7 +337,7 @@ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start, if (!slurmctld_locked) lock_slurmctld(job_write_lock); - if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, preempted))) { + if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, preempted, 0))) { error("Couldn't requeue job %u, failing it: %s", job_id, slurm_strerror(rc)); job_fail(job_id, job_state); diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index f0fbd59d3b07d57c8cdccf57cadc9ec1924cbb60..260146617050a8adec21b1a3bb08a97d4a44a29c 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -881,14 +881,8 @@ _process_command (int argc, char *argv[]) "too few arguments for keyword:%s\n", tag); } else { - error_code = scontrol_requeue((argc - 1), &argv[1]); - if (error_code) { - exit_code = 1; - if (quiet_flag != 1) - slurm_perror ("slurm_requeue error"); - } + scontrol_requeue((argc - 1), &argv[1]); } - } else if (strncasecmp(tag, "requeuehold", 11) == 0) { if (argc > 3) { @@ -904,12 +898,7 
@@ _process_command (int argc, char *argv[]) "too few arguments for keyword:%s\n", tag); } else { - error_code = scontrol_requeue_hold((argc - 1), &argv[1]); - if (error_code) { - exit_code = 1; - if (quiet_flag != 1) - slurm_perror ("slurm_requeue error"); - } + scontrol_requeue_hold((argc - 1), &argv[1]); } } diff --git a/src/scontrol/scontrol.h b/src/scontrol/scontrol.h index 63c762ff47412905be765d8a888aa2332f1aa6e1..341c0202adcac6faaed6699090d4fc397c68fd45 100644 --- a/src/scontrol/scontrol.h +++ b/src/scontrol/scontrol.h @@ -152,8 +152,8 @@ extern void scontrol_print_block (char *block_name); extern void scontrol_print_res (char *reservation_name); extern void scontrol_print_step (char *job_step_id_str); extern void scontrol_print_topo (char *node_list); -extern int scontrol_requeue(int argc, char **argv); -extern int scontrol_requeue_hold(int argc, char **argv); +extern void scontrol_requeue(int argc, char **argv); +extern void scontrol_requeue_hold(int argc, char **argv); extern void scontrol_suspend(char *op, char *job_id_str); extern int scontrol_update_front_end (int argc, char *argv[]); extern int scontrol_update_job (int argc, char *argv[]); diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index a283a85685d3b87c4e308981617996a8d3a1b7f2..dbd722f7bf2361c796107dc78e2d623da53e5ed2 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -372,7 +372,7 @@ scontrol_suspend(char *op, char *job_id_str) xstrfmtcat(this_job_id, "%u_%u", ids[i].array_job_id, ids[i].array_task_id); } else { - xstrfmtcat(this_job_id, "%u", ids[i].array_job_id); + xstrfmtcat(this_job_id, "%u", ids[i].job_id); } if (strncasecmp(op, "suspend", MAX(strlen(op), 2)) == 0) cc = slurm_suspend2(this_job_id); @@ -393,20 +393,19 @@ scontrol_suspend(char *op, char *job_id_str) /* * scontrol_requeue - requeue a pending or running batch job * IN job_id_str - a job id - * RET 0 if no slurm error, errno otherwise. 
parsing error prints - * error message and returns 0 */ -extern int +extern void scontrol_requeue(int argc, char **argv) { int rc = SLURM_SUCCESS; int i; job_ids_t *ids; uint32_t num_ids = 0; + char *this_job_id = NULL; - if (! argv[0]) { + if (!argv[0]) { exit_code = 1; - return 0; + return; } if (strncasecmp(argv[0], "jobid=", 6) == 0) @@ -417,32 +416,33 @@ scontrol_requeue(int argc, char **argv) ids = _get_job_ids(argv[0], &num_ids); if (ids == NULL) { exit_code = 1; - return 0; + return; } for (i = 0; i < num_ids; i++) { - rc = slurm_requeue(ids[i].job_id, 0); - if (rc != SLURM_SUCCESS) { - if (ids[i].array_task_id != NO_VAL) { - fprintf(stderr, "%s for job %u_%u (%u)\n", - slurm_strerror(slurm_get_errno()), - ids[i].array_job_id, - ids[i].array_task_id, - ids[i].job_id); - } else { - fprintf(stderr, "%s for job %u\n", - slurm_strerror(slurm_get_errno()), - ids[i].job_id); - } + if (ids[i].array_task_str) { + xstrfmtcat(this_job_id, "%u_%s", + ids[i].array_job_id, ids[i].array_task_str); + } else if (ids[i].array_task_id != NO_VAL) { + xstrfmtcat(this_job_id, "%u_%u", + ids[i].array_job_id, ids[i].array_task_id); + } else { + xstrfmtcat(this_job_id, "%u", ids[i].job_id); } + rc = slurm_requeue2(this_job_id, 0); + if (rc != SLURM_SUCCESS) + exit_code = 1; + if ((rc != SLURM_SUCCESS) && (quiet_flag != 1)) { + fprintf(stderr, "%s for job %s\n", + slurm_strerror(slurm_get_errno()), this_job_id); + } + xfree(this_job_id); } _free_job_ids(ids, num_ids); - - return rc; } -extern int +extern void scontrol_requeue_hold(int argc, char **argv) { int rc = SLURM_SUCCESS; @@ -450,7 +450,7 @@ scontrol_requeue_hold(int argc, char **argv) uint32_t state_flag; job_ids_t *ids; uint32_t num_ids; - char *job_id_str; + char *job_id_str, *this_job_id = NULL; state_flag = 0; @@ -467,7 +467,7 @@ scontrol_requeue_hold(int argc, char **argv) ids = _get_job_ids(job_id_str, &num_ids); if (ids == NULL) { exit_code = 1; - return 0; + return; } if (argc == 2) { @@ -476,7 +476,7 @@ 
scontrol_requeue_hold(int argc, char **argv) error("Invalid state specification %s", argv[0]); exit_code = 1; _free_job_ids(ids, num_ids); - return 0; + return; } } state_flag |= JOB_REQUEUE_HOLD; @@ -485,25 +485,26 @@ scontrol_requeue_hold(int argc, char **argv) * JOB_SPECIAL_EXIT or HELD state. */ for (i = 0; i < num_ids; i++) { - rc = slurm_requeue(ids[i].job_id, state_flag); - if (rc != SLURM_SUCCESS) { - if (ids[i].array_task_id != NO_VAL) { - fprintf(stderr, "%s for job %u_%u (%u)\n", - slurm_strerror(slurm_get_errno()), - ids[i].array_job_id, - ids[i].array_task_id, - ids[i].job_id); - } else { - fprintf(stderr, "%s for job %u\n", - slurm_strerror(slurm_get_errno()), - ids[i].job_id); - } + if (ids[i].array_task_str) { + xstrfmtcat(this_job_id, "%u_%s", + ids[i].array_job_id, ids[i].array_task_str); + } else if (ids[i].array_task_id != NO_VAL) { + xstrfmtcat(this_job_id, "%u_%u", + ids[i].array_job_id, ids[i].array_task_id); + } else { + xstrfmtcat(this_job_id, "%u", ids[i].job_id); } + rc = slurm_requeue2(this_job_id, state_flag); + if (rc != SLURM_SUCCESS) + exit_code = 1; + if ((rc != SLURM_SUCCESS) && (quiet_flag != 1)) { + fprintf(stderr, "%s for job %s\n", + slurm_strerror(slurm_get_errno()), this_job_id); + } + xfree(this_job_id); } _free_job_ids(ids, num_ids); - - return rc; } /* diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c index b7f98ced17c20614e9a518c2c2e3eac7e4611ee5..631138a5c02a0fab03fdd1cf7f170166f0cfb94a 100644 --- a/src/slurmctld/gang.c +++ b/src/slurmctld/gang.c @@ -657,7 +657,7 @@ static void _preempt_job_dequeue(void) job_ptr->batch_flag && job_ptr->details && (job_ptr->details->requeue > 0)) { rc = job_requeue(0, job_ptr->job_id, -1, - (uint16_t)NO_VAL, true); + (uint16_t)NO_VAL, true, 0); if (rc == SLURM_SUCCESS) { info("preempted job %u has been requeued", job_ptr->job_id); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 
eaa8e2b54164154fb9356675385e5d307ceb3930..f5bfcdc73d3ce46c9883d978d6813ce092f2146c 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -11537,7 +11537,8 @@ extern int job_suspend2(suspend_msg_t *sus_ptr, uid_t uid, } if (!valid) { info("job_suspend2: invalid job id %s", sus_ptr->job_id_str); - return ESLURM_INVALID_JOB_ID; + rc = ESLURM_INVALID_JOB_ID; + goto reply; } i_first = bit_ffs(array_bitmap); @@ -11572,41 +11573,24 @@ extern int job_suspend2(suspend_msg_t *sus_ptr, uid_t uid, } /* - * job_requeue - Requeue a running or pending batch job + * _job_requeue - Requeue a running or pending batch job * IN uid - user id of user issuing the RPC - * IN job_id - id of the job to be requeued - * IN conn_fd - file descriptor on which to send reply - * IN protocol_version - slurm protocol version of client + * IN job_ptr - job to be requeued * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ -extern int job_requeue(uid_t uid, - uint32_t job_id, - slurm_fd_t conn_fd, - uint16_t protocol_version, - bool preempt) +static int _job_requeue(uid_t uid, struct job_record *job_ptr, bool preempt, + uint32_t state) { - int rc = SLURM_SUCCESS; - struct job_record *job_ptr = NULL; bool suspended = false; - slurm_msg_t resp_msg; - return_code_msg_t rc_msg; time_t now = time(NULL); bool is_running; - /* find the job */ - job_ptr = find_job_record(job_id); - if (job_ptr == NULL) { - rc = ESLURM_INVALID_JOB_ID; - goto reply; - } - /* validate the request */ if ((uid != job_ptr->user_id) && !validate_operator(uid) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) { - rc = ESLURM_ACCESS_DENIED; - goto reply; + return ESLURM_ACCESS_DENIED; } /* If the partition was removed don't allow the job to be @@ -11615,8 +11599,7 @@ extern int job_requeue(uid_t uid, */ if (!job_ptr->part_ptr || !job_ptr->details || !job_ptr->details->requeue) { - rc = ESLURM_DISABLED; - goto reply; + return ESLURM_DISABLED; } /* In the 
job is in the process of completing @@ -11628,21 +11611,18 @@ extern int job_requeue(uid_t uid, uint32_t flags; flags = job_ptr->job_state & JOB_STATE_FLAGS; job_ptr->job_state = JOB_PENDING | flags; - goto reply; + return SLURM_SUCCESS; } /* If the job is already pending do nothing * and return is well to the library. */ - if (IS_JOB_PENDING(job_ptr)) { - rc = ESLURM_JOB_PENDING; - goto reply; - } + if (IS_JOB_PENDING(job_ptr)) + return ESLURM_JOB_PENDING; if (job_ptr->batch_flag == 0) { debug("Job-requeue can only be done for batch jobs"); - rc = ESLURM_BATCH_ONLY; - goto reply; + return ESLURM_BATCH_ONLY; } slurm_sched_g_requeue(job_ptr, "Job requeued by user/admin"); @@ -11670,8 +11650,7 @@ extern int job_requeue(uid_t uid, * running state. */ is_running = false; - if (IS_JOB_SUSPENDED(job_ptr) - || IS_JOB_RUNNING(job_ptr)) + if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) is_running = true; /* We want this job to have the requeued state in the @@ -11706,7 +11685,176 @@ extern int job_requeue(uid_t uid, * to add it again. 
*/ acct_policy_add_job_submit(job_ptr); -reply: + if (state & JOB_SPECIAL_EXIT) { + job_ptr->job_state |= JOB_SPECIAL_EXIT; + job_ptr->state_reason = WAIT_HELD_USER; + job_ptr->priority = 0; + } + if (state & JOB_REQUEUE_HOLD) { + job_ptr->state_reason = WAIT_HELD_USER; + job_ptr->priority = 0; + } + + debug("%s: job %u state 0x%x reason %u priority %d", __func__, + job_ptr->job_id, job_ptr->job_state, + job_ptr->state_reason, job_ptr->priority); + + return SLURM_SUCCESS; +} + +/* + * job_requeue - Requeue a running or pending batch job + * IN uid - user id of user issuing the RPC + * IN job_id - id of the job to be requeued + * IN conn_fd - file descriptor on which to send reply + * IN protocol_version - slurm protocol version of client + * IN preempt - true if job being preempted + * IN state - may be set to JOB_SPECIAL_EXIT and/or JOB_REQUEUE_HOLD + * RET 0 on success, otherwise ESLURM error code + */ +extern int job_requeue(uid_t uid, uint32_t job_id, + slurm_fd_t conn_fd, uint16_t protocol_version, + bool preempt, uint32_t state) +{ + int rc = SLURM_SUCCESS; + struct job_record *job_ptr = NULL; + slurm_msg_t resp_msg; + return_code_msg_t rc_msg; + + /* find the job */ + job_ptr = find_job_record(job_id); + if (job_ptr == NULL) { + rc = ESLURM_INVALID_JOB_ID; + } else { + rc = _job_requeue(uid, job_ptr, preempt, state); + } + + if (conn_fd >= 0) { + slurm_msg_t_init(&resp_msg); + resp_msg.protocol_version = protocol_version; + resp_msg.msg_type = RESPONSE_SLURM_RC; + rc_msg.return_code = rc; + resp_msg.data = &rc_msg; + slurm_send_node_msg(conn_fd, &resp_msg); + } + return rc; +} + +/* + * job_requeue2 - Requeue a running or pending batch job + * IN uid - user id of user issuing the RPC + * IN req_ptr - request including ID of the job to be requeued + * IN conn_fd - file descriptor on which to send reply + * IN protocol_version - slurm protocol version of client + * IN preempt - true if job being preempted + * RET 0 on success, otherwise ESLURM error code + */ 
+extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, + slurm_fd_t conn_fd, uint16_t protocol_version, + bool preempt) +{ + static uint32_t max_array_size = NO_VAL; + slurm_ctl_conf_t *conf; + int rc = SLURM_SUCCESS, rc2; + struct job_record *job_ptr = NULL; + long int long_id; + uint32_t job_id; + char *end_ptr = NULL, *tok, *tmp; + bitstr_t *array_bitmap; + bool valid = true; + int32_t i, i_first, i_last; + slurm_msg_t resp_msg; + return_code_msg_t rc_msg; + uint32_t state = req_ptr->state; + char *job_id_str = req_ptr->job_id_str; + + if (max_array_size == NO_VAL) { + conf = slurm_conf_lock(); + max_array_size = conf->max_array_sz; + slurm_conf_unlock(); + } + + long_id = strtol(job_id_str, &end_ptr, 10); + if ((long_id <= 0) || (long_id == LONG_MAX) || + ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) { + info("job_requeue2: invalid job id %s", job_id_str); + rc = ESLURM_INVALID_JOB_ID; + goto reply; + } + job_id = (uint32_t) long_id; + if (end_ptr[0] == '\0') { /* Single job (or full job array) */ + struct job_record *job_ptr_done = NULL; + job_ptr = find_job_record(job_id); + if (job_ptr && (job_ptr->array_task_id == NO_VAL) && + (job_ptr->array_recs == NULL)) { + /* This is a regular job, not a job array */ + rc = _job_requeue(uid, job_ptr, preempt, state); + goto reply; + } + + if (job_ptr && job_ptr->array_recs) { + /* This is a job array */ + rc = _job_requeue(uid, job_ptr, preempt, state); + job_ptr_done = job_ptr; + } + + /* Requeue all tasks of this job array */ + job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; + if (!job_ptr && !job_ptr_done) { + rc = ESLURM_INVALID_JOB_ID; + goto reply; + } + while (job_ptr) { + if ((job_ptr->array_job_id == job_id) && + (job_ptr != job_ptr_done)) { + rc2 = _job_requeue(uid, job_ptr, preempt,state); + rc = MAX(rc, rc2); + } + job_ptr = job_ptr->job_array_next_j; + } + goto reply; + } + + array_bitmap = bit_alloc(max_array_size); + tmp = xstrdup(end_ptr + 1); + tok = strtok_r(tmp, ",", &end_ptr); + while (tok 
&& valid) { + valid = _parse_array_tok(tok, array_bitmap, + max_array_size); + tok = strtok_r(NULL, ",", &end_ptr); + } + xfree(tmp); + if (valid) { + i_last = bit_fls(array_bitmap); + if (i_last < 0) + valid = false; + } + if (!valid) { + info("job_requeue2: invalid job id %s", job_id_str); + rc = ESLURM_INVALID_JOB_ID; + goto reply; + } + + i_first = bit_ffs(array_bitmap); + if (i_first >= 0) + i_last = bit_fls(array_bitmap); + else + i_last = -2; + for (i = i_first; i <= i_last; i++) { + if (!bit_test(array_bitmap, i)) + continue; + job_ptr = find_job_array_rec(job_id, i); + if (job_ptr == NULL) { + info("job_requeue2: invalid job id %u_%d", job_id, i); + rc = ESLURM_INVALID_JOB_ID; + continue; + } + + rc2 = _job_requeue(uid, job_ptr, preempt, state); + rc = MAX(rc, rc2); + } + + reply: if (conn_fd >= 0) { slurm_msg_t_init(&resp_msg); resp_msg.protocol_version = protocol_version; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index ee322385902e9568643f9aef244f4fc41062bc07..fa4ccabd71be1ebff91152a59d8272768ae166e0 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -2952,7 +2952,7 @@ static void *_run_prolog(void *arg) job_id, WEXITSTATUS(status), WTERMSIG(status)); lock_slurmctld(job_write_lock); if ((rc = job_requeue(0, job_id, -1, (uint16_t) NO_VAL, - false))) { + false, 0))) { info("unable to requeue job %u: %m", job_id); kill_job = true; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index dc95476fce2dfc8623c8ca94b4256c548b0f7558..52b078cefb445d88e8b8b123898b5096f638e34c 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1487,7 +1487,7 @@ static void _preempt_jobs(List preemptee_job_list, bool kill_pending, if (!kill_pending) continue; rc = job_requeue(0, job_ptr->job_id, -1, - (uint16_t)NO_VAL, true); + (uint16_t)NO_VAL, true, 0); if (rc == SLURM_SUCCESS) { info("preempted job %u has been requeued", job_ptr->job_id); diff --git 
a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index a11a800cfb14e83118f467875f714abce9b07947..e87fefc5afa5d0c82357c0d87335e251e9ae31b2 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -447,7 +447,7 @@ void slurmctld_req(slurm_msg_t *msg, connection_arg_t *arg) break; case REQUEST_JOB_REQUEUE: _slurm_rpc_requeue(msg); - slurm_free_job_id_msg(msg->data); + slurm_free_requeue_msg(msg->data); break; case REQUEST_JOB_READY: _slurm_rpc_job_ready(msg); @@ -3839,63 +3839,28 @@ inline static void _slurm_rpc_requeue(slurm_msg_t * msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL); - struct job_record *job_ptr; START_TIMER; - info("%s: Processing RPC: REQUEST_REQUEUE from uid=%d", __func__, uid); - - job_ptr = find_job_record(req_ptr->job_id); - if (job_ptr == NULL) { - slurm_msg_t resp_msg; - return_code_msg_t rc_msg; - - info("%s: %u: %s", __func__, req_ptr->job_id, - slurm_strerror(ESLURM_INVALID_JOB_ID)); - - slurm_msg_t_init(&resp_msg); - resp_msg.protocol_version = msg->protocol_version; - resp_msg.msg_type = RESPONSE_SLURM_RC; - rc_msg.return_code = ESLURM_INVALID_JOB_ID; - resp_msg.data = &rc_msg; - slurm_send_node_msg(msg->conn_fd, &resp_msg); - - return; - } + info("%s: Processing RPC: REQUEST_JOB_REQUEUE from uid=%d", __func__, + uid); lock_slurmctld(job_write_lock); - error_code = job_requeue(uid, - req_ptr->job_id, - msg->conn_fd, - msg->protocol_version, - false); + if (req_ptr->job_id_str) { + error_code = job_requeue2(uid, req_ptr, msg->conn_fd, + msg->protocol_version, false); + } else { + error_code = job_requeue(uid, req_ptr->job_id, msg->conn_fd, + msg->protocol_version, false, + req_ptr->state); + } unlock_slurmctld(job_write_lock); END_TIMER2("_slurm_rpc_requeue"); if (error_code) { info("%s: %u: %s", __func__, req_ptr->job_id, slurm_strerror(error_code)); - return; } - /* 
Requeue operation went all right, see if the user - * wants to mark the job as special case or hold it. - */ - if (req_ptr->state & JOB_SPECIAL_EXIT) { - job_ptr->job_state |= JOB_SPECIAL_EXIT; - job_ptr->state_reason = WAIT_HELD_USER; - job_ptr->priority = 0; - } - if (req_ptr->state & JOB_REQUEUE_HOLD) { - job_ptr->state_reason = WAIT_HELD_USER; - job_ptr->priority = 0; - } - - debug("%s: job %u state 0x%x reason %u priority %d", __func__, - job_ptr->job_id, job_ptr->job_state, - job_ptr->state_reason, job_ptr->priority); - - info("%s: %u: %s", __func__, req_ptr->job_id, TIME_STR); - /* Functions below provide their own locking */ schedule_job_save(); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 5c8fec30e0e0132a7de02228606b0084e1195542..395e2ce875f7593e5f2d0b74bcd0ade6eb6147e0 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1314,13 +1314,26 @@ extern int job_req_node_filter(struct job_record *job_ptr, * IN conn_fd - file descriptor on which to send reply * IN protocol_version - slurm protocol version of client * IN preempt - true if job being preempted + * IN state - may be set to JOB_SPECIAL_EXIT and/or JOB_REQUEUE_HOLD * RET 0 on success, otherwise ESLURM error code */ -extern int job_requeue(uid_t uid, - uint32_t job_id, - slurm_fd_t conn_fd, - uint16_t protocol_version, +extern int job_requeue(uid_t uid, uint32_t job_id, + slurm_fd_t conn_fd, uint16_t protocol_version, + bool preempt, uint32_t state); + +/* + * job_requeue2 - Requeue a running or pending batch job + * IN uid - user id of user issuing the RPC + * IN req_ptr - request including ID of the job to be requeued + * IN conn_fd - file descriptor on which to send reply + * IN protocol_version - slurm protocol version of client + * IN preempt - true if job being preempted + * RET 0 on success, otherwise ESLURM error code + */ +extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, + slurm_fd_t conn_fd, uint16_t protocol_version, bool preempt); + 
/* * job_step_complete - note normal completion the specified job step * IN job_id - id of the job to be completed