From 1973ebef68c29425b950d7735aa60280fd2f48f5 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 17 Aug 2004 15:52:37 +0000 Subject: [PATCH] Srun cancels a created job if job step creation fails (rather than leaving it orphaned). --- NEWS | 8 +++++++- src/srun/allocate.c | 16 ++++++++++------ src/srun/allocate.h | 4 +++- src/srun/srun.c | 8 ++++++-- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index ddd3a74591e..7df4cc54ab5 100644 --- a/NEWS +++ b/NEWS @@ -3,8 +3,14 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 0.4.0-pre2 ============================= + -- Fixes for reported problems: + - slurm/477: Signal of batch job script (scancel -b) fixed -- NOTE: "startclean" when transitioning from version 0.4.0-pre1, JOBS ARE LOST - -- Preserve job's requested processor count info after job is initiated + -- Preserve job's requested processor count info after job is initiated + (for viewing by squeue and scontrol) + -- Added lots of Blue Gene/L support logic: slurmd executes on a single + node to front-end the 512-CPU base-partitions (Blue Gene/L's nodes) + -- srun cancels created job if job step creation fails * Changes in SLURM 0.4.0-pre1 ============================= diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 4d11e2ac676..d9819cecc5d 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -469,17 +469,20 @@ _step_req_destroy(job_step_create_request_msg_t *r) } } -void +int create_job_step(job_t *job) { job_step_create_request_msg_t *req = NULL; job_step_create_response_msg_t *resp = NULL; - if (!(req = _step_req_create(job))) - fatal ("Unable to allocate step request message"); - - if ((slurm_job_step_create(req, &resp) < 0) || (resp == NULL)) - fatal ("Unable to create job step: %m"); + if (!(req = _step_req_create(job))) { + error ("Unable to allocate step request message"); + return -1; + } + if ((slurm_job_step_create(req, &resp) < 0) || (resp == 
NULL)) { + error ("Unable to create job step: %m"); + return -1; + } job->stepid = resp->job_step_id; job->cred = resp->cred; @@ -490,5 +493,6 @@ create_job_step(job_t *job) job_update_io_fnames(job); _step_req_destroy(req); + return 0; } diff --git a/src/srun/allocate.h b/src/srun/allocate.h index 2eef790161a..d92ebc81108 100644 --- a/src/srun/allocate.h +++ b/src/srun/allocate.h @@ -74,8 +74,10 @@ uint32_t jobid_from_env(void); /* * Create a job step given the job information stored in 'j' * After returning, 'j' is filled in with information for job step. + * + * Returns -1 if job step creation fails, 0 otherwise */ -void create_job_step(job_t *j); +int create_job_step(job_t *j); #endif /* !_HAVE_ALLOCATE_H */ diff --git a/src/srun/srun.c b/src/srun/srun.c index 804c9ed3ade..530234bc22d 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -155,7 +155,8 @@ int srun(int ac, char **av) job = job_create_allocation(resp); job->old_job = true; sig_setup_sigmask(); - create_job_step(job); + if (create_job_step(job) < 0) + exit(1); slurm_free_resource_allocation_response_msg(resp); } else if (opt.allocate) { @@ -191,7 +192,10 @@ int srun(int ac, char **av) _print_job_information(resp); job = job_create_allocation(resp); - create_job_step(job); + if (create_job_step(job) < 0) { + job_destroy(job, 0); + exit(1); + } slurm_free_resource_allocation_response_msg(resp); } -- GitLab