From a2fa37bb0253eb817c3e938d23fac1509d33578a Mon Sep 17 00:00:00 2001 From: David Bigagli <david@schedmd.com> Date: Mon, 18 Feb 2013 10:18:17 -0800 Subject: [PATCH] When no more job IDs available, return EAGAIN error to user The job submit commands will retry. Previously the slurmctld daemon generated a fatal error --- src/salloc/salloc.c | 4 ++-- src/slurmctld/job_mgr.c | 22 ++++++++++++++-------- src/srun/libsrun/allocate.c | 2 +- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 074d868f9cd..7b70e9e70bc 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -312,8 +312,8 @@ int main(int argc, char *argv[]) before = time(NULL); while ((alloc = slurm_allocate_resources_blocking(&desc, opt.immediate, _pending_callback)) == NULL) { - if ((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || - (retries >= MAX_RETRIES)) + if (((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) && + (errno != EAGAIN)) || (retries >= MAX_RETRIES)) break; if (retries == 0) error("%s", msg); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 7ce3a53abe7..a52c155c488 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -184,7 +184,7 @@ static int _reset_detail_bitmaps(struct job_record *job_ptr); static void _reset_step_bitmaps(struct job_record *job_ptr); static int _resume_job_nodes(struct job_record *job_ptr, bool indf_susp); static void _send_job_kill(struct job_record *job_ptr); -static void _set_job_id(struct job_record *job_ptr); +static int _set_job_id(struct job_record *job_ptr); static void _signal_batch_job(struct job_record *job_ptr, uint16_t signal); static void _signal_job(struct job_record *job_ptr, int signal); static void _suspend_job(struct job_record *job_ptr, uint16_t op, @@ -2714,7 +2714,8 @@ struct job_record *_job_rec_copy(struct job_record *job_ptr) return job_ptr_new; /* Set job-specific ID and hash table */ - _set_job_id(job_ptr_new); + if (_set_job_id(job_ptr_new)) + fatal("job array create_job_record error"); _add_job_hash(job_ptr_new); /* Copy most of original job data. @@ -4986,10 +4987,13 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, job_ptr->partition = xstrdup(job_desc->partition); - if (job_desc->job_id != NO_VAL) /* already confirmed unique */ + if (job_desc->job_id != NO_VAL) { /* already confirmed unique */ job_ptr->job_id = job_desc->job_id; - else - _set_job_id(job_ptr); + } else { + error_code = _set_job_id(job_ptr); + if (error_code) + return error_code; + } if (job_desc->name) job_ptr->name = xstrdup(job_desc->name); @@ -6463,7 +6467,7 @@ extern uint32_t get_next_job_id(void) * _set_job_id - set a default job_id, insure that it is unique * IN job_ptr - pointer to the job_record */ -static void _set_job_id(struct job_record *job_ptr) +static int _set_job_id(struct job_record *job_ptr) { int i; uint32_t new_id; @@ -6480,12 +6484,14 @@ static void _set_job_id(struct job_record *job_ptr) new_id = job_id_sequence; if (find_job_record(new_id) == NULL) { job_ptr->job_id = new_id; - return; + return SLURM_SUCCESS; } } - fatal("We have exhausted our supply of valid job id values." + error("We have exhausted our supply of valid job id values. " "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id, slurmctld_conf.max_job_id); + job_ptr->job_id = NO_VAL; + return EAGAIN; } diff --git a/src/srun/libsrun/allocate.c b/src/srun/libsrun/allocate.c index da4365953bb..08f6f286814 100644 --- a/src/srun/libsrun/allocate.c +++ b/src/srun/libsrun/allocate.c @@ -185,7 +185,7 @@ static bool _retry(void) static char *msg = "Slurm controller not responding, " "sleeping and retrying."; - if (errno == ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) { + if ((errno == ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || (errno == EAGAIN)) { if (retries == 0) error("%s", msg); else if (retries < MAX_RETRIES) -- GitLab