From 1973ebef68c29425b950d7735aa60280fd2f48f5 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 17 Aug 2004 15:52:37 +0000
Subject: [PATCH] Srun cancels a created job if job step creation fails (rather
 than leaving it orphaned).

---
 NEWS                |  8 +++++++-
 src/srun/allocate.c | 16 ++++++++++------
 src/srun/allocate.h |  4 +++-
 src/srun/srun.c     |  8 ++++++--
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index ddd3a74591e..7df4cc54ab5 100644
--- a/NEWS
+++ b/NEWS
@@ -3,8 +3,14 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 0.4.0-pre2
 =============================
+ -- Fixes for reported problems:
+   - slurm/477: Signal of batch job script (scancel -b) fixed
  -- NOTE: "startclean" when transitioning from version 0.4.0-pre1, JOBS ARE LOST
- -- Preserve job's requested processor count info after job is initiated
+ -- Preserve job's requested processor count info after job is initiated 
+    (for viewing by squeue and scontrol)
+ -- Added a lot of Blue Gene/L support logic: slurmd executes on a single
+    node to front-end the 512-CPU base-partitions (Blue Gene/L's nodes)
+ -- srun cancels created job if job step creation fails
 
 * Changes in SLURM 0.4.0-pre1
 =============================
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 4d11e2ac676..d9819cecc5d 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -469,17 +469,20 @@ _step_req_destroy(job_step_create_request_msg_t *r)
 	}
 }
 
-void
+int
 create_job_step(job_t *job)
 {
 	job_step_create_request_msg_t  *req  = NULL;
 	job_step_create_response_msg_t *resp = NULL;
 
-	if (!(req = _step_req_create(job))) 
-		fatal ("Unable to allocate step request message");
-
-	if ((slurm_job_step_create(req, &resp) < 0) || (resp == NULL)) 
-		fatal ("Unable to create job step: %m");
+	if (!(req = _step_req_create(job))) { 
+		error ("Unable to allocate step request message");
+		return -1;
+	}
+	if ((slurm_job_step_create(req, &resp) < 0) || (resp == NULL)) { 
+		error ("Unable to create job step: %m");
+		return -1;
+	}
 
 	job->stepid  = resp->job_step_id;
 	job->cred    = resp->cred;
@@ -490,5 +493,6 @@ create_job_step(job_t *job)
 	job_update_io_fnames(job);
 
 	_step_req_destroy(req);
+	return 0;
 }
 
diff --git a/src/srun/allocate.h b/src/srun/allocate.h
index 2eef790161a..d92ebc81108 100644
--- a/src/srun/allocate.h
+++ b/src/srun/allocate.h
@@ -74,8 +74,10 @@ uint32_t jobid_from_env(void);
 /*
  * Create a job step given the job information stored in 'j'
  * After returning, 'j' is filled in with information for job step.
+ *
+ * Returns -1 if job step creation fails, 0 otherwise
  */
-void create_job_step(job_t *j);
+int create_job_step(job_t *j);
 
 
 #endif /* !_HAVE_ALLOCATE_H */
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 804c9ed3ade..530234bc22d 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -155,7 +155,8 @@ int srun(int ac, char **av)
 		job = job_create_allocation(resp); 
 		job->old_job = true;
 		sig_setup_sigmask();
-		create_job_step(job);
+		if (create_job_step(job) < 0)
+			exit(1);
 		slurm_free_resource_allocation_response_msg(resp);
 
 	} else if (opt.allocate) {
@@ -191,7 +192,10 @@ int srun(int ac, char **av)
 			_print_job_information(resp);
 
 		job = job_create_allocation(resp); 
-		create_job_step(job);
+		if (create_job_step(job) < 0) {
+			job_destroy(job, 0);
+			exit(1);
+		}
 		slurm_free_resource_allocation_response_msg(resp);
 	}
 
-- 
GitLab