diff --git a/NEWS b/NEWS index 82b631639e0d86f21e9e6e35068cf38285e29b0a..d39221639128baacf527cacd39e653f3651a835d 100644 --- a/NEWS +++ b/NEWS @@ -144,6 +144,12 @@ documents those changes that are of interest to users and admins. you query against that with -N and -E you will get all jobs during that time instead of only the ones running on -N. -- BGP - Fix for HTC mode + -- Accounting - If a job start message to the SlurmDBD fails, reset the db_inx + so it gets sent again. This isn't a major problem since the start will + happen when the job ends, but this does make things cleaner. + -- If an salloc is waiting for an allocation to happen and is canceled by the + user, mark the state canceled instead of completed. + -- Fix issue in accounting if a user puts a '\' in their job name. * Changes in SLURM 2.4.4 ======================== diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index e1b9a583671bd2dea4918e8f7fb0f84586b03022..5aa62a482b09260e7bb7a064b1de1bfd5ae7e97b 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -155,7 +155,7 @@ extern char *slurm_add_slash_to_quotes(char *str) /* make a buffer 2 times the size just to be safe */ copy = dup = xmalloc((2 * len) + 1); if (copy) - do if (*str == '\'' || *str == '"') + do if (*str == '\\' || *str == '\'' || *str == '"') *dup++ = '\\'; while ((*dup++ = *str++)); diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index bfa529b3aa1030eb8ec94380275169b7baf5bd0a..c69bb40e226e746dc5f471da2e54c8014c663ba1 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -295,6 +295,7 @@ static void *_set_db_inx_thread(void *no_data) slurmdbd_msg_t req, resp; dbd_list_msg_t send_msg, *got_msg; int rc = SLURM_SUCCESS; + bool reset = 0; memset(&send_msg, 0, 
sizeof(dbd_list_msg_t)); @@ -305,20 +306,23 @@ static void *_set_db_inx_thread(void *no_data) rc = slurm_send_recv_slurmdbd_msg( SLURMDBD_VERSION, &req, &resp); list_destroy(local_job_list); - if (rc != SLURM_SUCCESS) + if (rc != SLURM_SUCCESS) { error("slurmdbd: DBD_SEND_MULT_JOB_START " "failure: %m"); - else if (resp.msg_type == DBD_RC) { + reset = 1; + } else if (resp.msg_type == DBD_RC) { dbd_rc_msg_t *msg = resp.data; if (msg->return_code == SLURM_SUCCESS) { info("%s", msg->comment); } else error("%s", msg->comment); slurmdbd_free_rc_msg(msg); + reset = 1; } else if (resp.msg_type != DBD_GOT_MULT_JOB_START) { error("slurmdbd: response type not " "DBD_GOT_MULT_JOB_START: %u", resp.msg_type); + reset = 1; } else { dbd_id_rc_msg_t *id_ptr = NULL; got_msg = (dbd_list_msg_t *) resp.data; @@ -335,6 +339,19 @@ static void *_set_db_inx_thread(void *no_data) slurmdbd_free_list_msg(got_msg); } + + if (reset) { + lock_slurmctld(job_read_lock); + /* USE READ LOCK, SEE ABOVE on first + * read lock */ + itr = list_iterator_create(job_list); + while ((job_ptr = list_next(itr))) { + if (job_ptr->db_index == NO_VAL) + job_ptr->db_index = 0; + } + list_iterator_destroy(itr); + unlock_slurmctld(job_read_lock); + } } running_db_inx = 0; diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 6f6bd64f8b1cafb45f2192dd4f4d4efff29c455a..46991d1e1a0d2a05f587048cf5df316b0ed5fc1a 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -813,7 +813,7 @@ static void _signal_while_allocating(int signo) { allocation_interrupted = true; if (pending_job_id != 0) { - slurm_complete_job(pending_job_id, 0); + slurm_complete_job(pending_job_id, NO_VAL); } } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 5429b0d79b0b0ada5268900830b2abf4b89fcdc1..89239068d1a97469373ca1774d32221010d63f14 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3137,12 +3137,14 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, if 
(IS_JOB_RUNNING(job_ptr)) job_comp_flag = JOB_COMPLETING; - else if (IS_JOB_PENDING(job_ptr)) + else if (IS_JOB_PENDING(job_ptr)) { + job_return_code = NO_VAL; job_ptr->start_time = now; + } if ((job_return_code == NO_VAL) && (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) { - info("Job %u cancelled from srun", job_ptr->job_id); + info("Job %u cancelled from interactive user", job_ptr->job_id); } if (IS_JOB_SUSPENDED(job_ptr)) {