From e5cef28bfa4b646a91c8cd1a465e6d6c7e949f2f Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 16 Oct 2002 23:02:41 +0000 Subject: [PATCH] Added new error code ESLURM_JOB_PENDING, returned when the allocation check RPC finds the job is still not running. srun now waits for job initiation, polling as needed. --- src/srun/srun.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/srun/srun.c b/src/srun/srun.c index ea8f7140fd8..d9c60194b9c 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -274,6 +274,7 @@ allocate_nodes(void) int rc, retries; job_desc_msg_t job; resource_allocation_response_msg_t *resp; + old_job_alloc_msg_t old_job; slurm_init_job_desc_msg(&job); @@ -325,9 +326,23 @@ allocate_nodes(void) } } - if (resp->node_list == NULL) { - info("No nodes allocated. exiting"); - return NULL; + if ((rc == 0) && (resp->node_list == NULL)) { + if (_verbose || _debug) + info ("Job %u queued and waiting for resources", resp->job_id); + old_job.job_id = resp->job_id; + old_job.uid = (uint32_t) getuid(); + slurm_free_resource_allocation_response_msg (resp); + sleep (2); + /* Keep polling until the job is allocated resources */ + while (slurm_confirm_allocation(&old_job, &resp) == SLURM_FAILURE) { + if (slurm_get_errno() == ESLURM_JOB_PENDING) + sleep (10); + else { + error("Unable to confirm resource allocation for job %u: %s", + old_job.job_id, slurm_strerror(errno)); + exit (1); + } + } } return resp; -- GitLab