From 547da7f728a0de3623b9139f3c1491fae3fb31f8 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 15 Jul 2014 16:30:22 -0700
Subject: [PATCH] Add suspend array back-end

---
 slurm/slurm.h.in          |   5 ++
 src/slurmctld/job_mgr.c   | 149 +++++++++++++++++++++++++++++++++++++-
 src/slurmctld/slurmctld.h |   3 +
 3 files changed, 156 insertions(+), 1 deletion(-)

diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index d559d8d19ef..d5356852bbf 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1427,6 +1427,11 @@ typedef struct suspend_msg {
 	uint32_t job_id;	/* slurm job_id */
 } suspend_msg_t;
 
+typedef struct suspend_msg2 {
+	uint16_t op;		/* suspend operation, see enum suspend_opts */
+	char *job_id_str;	/* slurm job_id string */
+} suspend_msg2_t;
+
 typedef struct {
 	uint16_t ckpt_interval;	/* checkpoint interval in minutes */
 	uint32_t cpu_count;	/* number of required processors */
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 0436f978026..c8f03c4eda8 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -11220,6 +11220,14 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
 	goto reply;
 #endif
 
+	/* validate the request */
+	if ((uid != 0) && (uid != getuid())) {
+		error("SECURITY VIOLATION: Attempt to suspend job from user %u",
+		      (int) uid);
+		rc = ESLURM_ACCESS_DENIED;
+		goto reply;
+	}
+
 	/* find the job */
 	job_ptr = find_job_record (sus_ptr->job_id);
 	if (job_ptr == NULL) {
@@ -11227,13 +11235,152 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
 		goto reply;
 	}
 
+	rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
+
+ reply:
+	if (conn_fd >= 0) {
+		slurm_msg_t_init(&resp_msg);
+		resp_msg.protocol_version = protocol_version;
+		resp_msg.msg_type     = RESPONSE_SLURM_RC;
+		rc_msg.return_code    = rc;
+		resp_msg.data         = &rc_msg;
+		slurm_send_node_msg(conn_fd, &resp_msg);
+	}
+	return rc;
+}
+
+/*
+ * job_suspend2 - perform some suspend/resume operation
+ * IN sus_ptr - suspend/resume request message, job_id_str is "<job_id>"
+ *		or "<job_id>_<task_list>" with comma separated task_list tokens
+ * IN uid - user id of the user issuing the RPC
+ * IN conn_fd - file descriptor on which to send reply,
+ *              -1 if none
+ * indf_susp IN - set if job is being suspended indefinitely by user or admin
+ *                and we should clear its priority, otherwise suspended
+ *		  temporarily for gang scheduling
+ * IN protocol_version - slurm protocol version of client
+ * RET 0 on success, otherwise ESLURM error code
+ */
+extern int job_suspend2(suspend_msg2_t *sus_ptr, uid_t uid,
+			slurm_fd_t conn_fd, bool indf_susp,
+			uint16_t protocol_version)
+{
+	static uint32_t max_array_size = NO_VAL;
+	slurm_ctl_conf_t *conf;
+	int rc = SLURM_SUCCESS, rc2;
+	struct job_record *job_ptr = NULL;
+	long int long_id;
+	uint32_t job_id;
+	char *end_ptr = NULL, *tok, *tmp;
+	bitstr_t *array_bitmap;
+	bool valid = true;
+	int32_t i, i_first, i_last;
+	slurm_msg_t resp_msg;
+	return_code_msg_t rc_msg;
+
+#ifdef HAVE_BG
+	rc = ESLURM_NOT_SUPPORTED;
+	goto reply;
+#endif
+
+	if (max_array_size == NO_VAL) {
+		conf = slurm_conf_lock();
+		max_array_size = conf->max_array_sz;
+		slurm_conf_unlock();
+	}
+
 	/* validate the request */
 	if ((uid != 0) && (uid != getuid())) {
+		error("SECURITY VIOLATION: Attempt to suspend job from user %u",
+		      (int) uid);
 		rc = ESLURM_ACCESS_DENIED;
 		goto reply;
 	}
 
-	rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
+	long_id = strtol(sus_ptr->job_id_str, &end_ptr, 10);
+	if ((long_id <= 0) || (long_id == LONG_MAX) ||
+	    ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) {
+		info("job_suspend2: invalid job id %s", sus_ptr->job_id_str);
+		rc = ESLURM_INVALID_JOB_ID;
+		goto reply;
+	}
+	job_id = (uint32_t) long_id;
+	if (end_ptr[0] == '\0') {	/* Single job (or full job array) */
+		struct job_record *job_ptr_done = NULL;
+		job_ptr = find_job_record(job_id);
+		if (job_ptr && (job_ptr->array_task_id == NO_VAL) &&
+		    (job_ptr->array_recs == NULL)) {
+			/* This is a regular job, not a job array */
+			rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
+			goto reply;
+		}
+
+		if (job_ptr && job_ptr->array_recs) {
+			/* This is a job array */
+			rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
+			job_ptr_done = job_ptr;
+		}
+
+		/* Suspend all tasks of this job array */
+		job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
+		if (!job_ptr && !job_ptr_done) {
+			rc = ESLURM_INVALID_JOB_ID;
+			goto reply;
+		}
+		while (job_ptr) {
+			if ((job_ptr->array_job_id == job_id) &&
+			    (job_ptr != job_ptr_done)) {
+				rc2 = _job_suspend(job_ptr, sus_ptr->op,
+						   indf_susp);
+				rc = MAX(rc, rc2);
+			}
+			job_ptr = job_ptr->job_array_next_j;
+		}
+		goto reply;
+	}
+
+	array_bitmap = bit_alloc(max_array_size);
+	tmp = xstrdup(end_ptr + 1);
+	tok = strtok_r(tmp, ",", &end_ptr);
+	while (tok && valid) {
+		valid = _parse_array_tok(tok, array_bitmap,
+					 max_array_size);
+		tok = strtok_r(NULL, ",", &end_ptr);
+	}
+	xfree(tmp);
+	if (valid) {
+		i_last = bit_fls(array_bitmap);
+		if (i_last < 0)
+			valid = false;
+	}
+	if (!valid) {
+		info("job_suspend2: invalid job id %s", sus_ptr->job_id_str);
+		/* Release the bitmap and still send the RPC reply */
+		FREE_NULL_BITMAP(array_bitmap);
+		rc = ESLURM_INVALID_JOB_ID;
+		goto reply;
+	}
+
+	i_first = bit_ffs(array_bitmap);
+	if (i_first >= 0)
+		i_last = bit_fls(array_bitmap);
+	else
+		i_last = -2;
+	for (i = i_first; i <= i_last; i++) {
+		if (!bit_test(array_bitmap, i))
+			continue;
+		job_ptr = find_job_array_rec(job_id, i);
+		if (job_ptr == NULL) {
+			info("job_suspend2: invalid job id %u_%d", job_id, i);
+			rc = ESLURM_INVALID_JOB_ID;
+			continue;
+		}
+
+		rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
+		rc = MAX(rc, rc2);
+	}
+	FREE_NULL_BITMAP(array_bitmap);
 
  reply:
 	if (conn_fd >= 0) {
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 69a94cdd93c..6bfc12cd589 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -1269,6 +1269,9 @@ extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags,
 extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
 		       slurm_fd_t conn_fd, bool indf_susp,
 		       uint16_t protocol_version);
+extern int job_suspend2(suspend_msg2_t *sus_ptr, uid_t uid,
+			slurm_fd_t conn_fd, bool indf_susp,
+			uint16_t protocol_version);
 
 /*
  * job_complete - note the normal termination the specified job
-- 
GitLab