From 250a02121d2080af92f508226be3169ffd3bd742 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 5 Dec 2002 19:34:10 +0000
Subject: [PATCH] Add flag to job when initiated with
 allocated_resources_and_run_job_step(). If set, the job step termination is
 treated like job termination (i.e. the nodes are released and the job state
 set to complete).

---
 src/slurmctld/controller.c |  8 +++++---
 src/slurmctld/slurmctld.h  | 11 +++++++++--
 src/slurmctld/step_mgr.c   | 13 +++++++++++--
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 6526f471d4d..8b78a11487f 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1026,6 +1026,7 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg)
 	debug("Processing RPC: REQUEST_COMPLETE_JOB_STEP");
 	uid = slurm_auth_uid(msg->cred);
 	lock_slurmctld(job_write_lock);
+
 	/* do RPC call */
 	/* First set node down as needed on fatal error */
 	if (complete_job_step_msg->slurm_rc != SLURM_SUCCESS) {
@@ -1077,7 +1078,8 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg)
 		error_code =
 		    job_step_complete(complete_job_step_msg->job_id,
 				      complete_job_step_msg->job_step_id,
-				      uid);
+				      uid, job_requeue,
+				      complete_job_step_msg->job_rc);
 		unlock_slurmctld(job_write_lock);
 
 		/* return result */
@@ -1502,7 +1504,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 	req_step_msg.node_count = INFINITE;
 	req_step_msg.cpu_count  = job_desc_msg->num_procs;
 	req_step_msg.num_tasks  = job_desc_msg->num_tasks;
-	error_code = step_create(&req_step_msg, &step_rec);
+	error_code = step_create(&req_step_msg, &step_rec, true);
 	/* note: no need to free step_rec, pointer to global job step record */
 	if (error_code) {
 		unlock_slurmctld(job_write_lock);
@@ -1841,7 +1843,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
 	if (error_code == 0) {
 		/* issue the RPC */
 		lock_slurmctld(job_write_lock);
-		error_code = step_create(req_step_msg, &step_rec);
+		error_code = step_create(req_step_msg, &step_rec, false);
 	}
 
 	/* return result */
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index fcaf15b2bfc..92be49c958e 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -224,6 +224,8 @@ struct job_record {
 	enum job_states job_state;	/* state of the job */
 	uint16_t kill_on_node_fail;	/* 1 if job should be killed on 
 					   node failure */
+	uint16_t kill_on_step_done;	/* 1 if job should be killed when 
+					   the job step completes */
 	char *nodes;			/*  list of nodes allocated to job */
 	bitstr_t *node_bitmap;		/* bitmap of nodes allocated to job */
 	uint32_t time_limit;		/* time_limit minutes or INFINITE */
@@ -597,12 +599,14 @@ extern int job_complete (uint32_t job_id, uid_t uid, bool requeue,
  * IN job_id - id of the job to be completed
  * IN step_id - id of the job step to be completed
  * IN uid - user id of user issuing the RPC
+ * IN requeue - job should be run again if possible
+ * IN job_return_code - job's return code; if set, job state becomes JOB_FAILED
  * RET 0 on success, otherwise ESLURM error code 
  * global: job_list - pointer global job list
  *	last_job_update - time of last job table update
  */
 extern int job_step_complete (uint32_t job_id, uint32_t job_step_id, 
-			uid_t uid);
+			uid_t uid, bool requeue, uint32_t job_return_code);
 
 /* 
  * job_time_limit - terminate jobs which have exceeded their time limit
@@ -853,11 +857,14 @@ extern void set_slurmd_addr (void);
  *	accoding to the step_specs.
  * IN step_specs - job step specifications
  * OUT new_step_record - pointer to the new step_record (NULL on error)
+ * IN kill_job_when_step_done - if set, kill the job on step completion
  * RET - 0 or error code
  * NOTE: don't free the returned step_record because that is managed through
  * 	the job.
  */
-extern int step_create ( step_specs *step_specs, struct step_record** );
+extern int step_create ( step_specs *step_specs, 
+			 struct step_record** new_step_record,
+			 bool kill_job_when_step_done );
 
 /*
  * update_job - update a job's parameters per the supplied specifications
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 09d4f35f9c4..7460c4dd93a 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -249,11 +249,14 @@ int job_step_cancel(uint32_t job_id, uint32_t step_id, uid_t uid)
  * IN job_id - id of the job to be completed
  * IN step_id - id of the job step to be completed
  * IN uid - user id of user issuing the RPC
+ * IN requeue - job should be run again if possible
+ * IN job_return_code - job's return code; if set, job state becomes JOB_FAILED
  * RET 0 on success, otherwise ESLURM error code 
  * global: job_list - pointer global job list
  *	last_job_update - time of last job table update
  */
-int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid)
+int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid,
+		      bool requeue, uint32_t job_return_code)
 {
 	struct job_record *job_ptr;
 	int error_code;
@@ -264,6 +267,9 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid)
 		return ESLURM_INVALID_JOB_ID;
 	}
 
+	if (job_ptr->kill_on_step_done)
+		return job_complete(job_id, uid, requeue, job_return_code);
+
 	if ((job_ptr->job_state == JOB_FAILED) ||
 	    (job_ptr->job_state == JOB_COMPLETE) ||
 	    (job_ptr->job_state == JOB_TIMEOUT))
@@ -413,12 +419,14 @@ cleanup:
  *	according to the step_specs.
  * IN step_specs - job step specifications
  * OUT new_step_record - pointer to the new step_record (NULL on error)
+ * IN kill_job_when_step_done - if set, kill the job on step completion
  * RET - 0 or error code
  * NOTE: don't free the returned step_record because that is managed through
  * 	the job.
  */
 int
-step_create ( step_specs *step_specs, struct step_record** new_step_record  )
+step_create ( step_specs *step_specs, struct step_record** new_step_record,
+	      bool kill_job_when_step_done )
 {
 	struct step_record *step_ptr;
 	struct job_record  *job_ptr;
@@ -450,6 +458,7 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 #endif
 
 	job_ptr->time_last_active = time(NULL);
+	job_ptr->kill_on_step_done = kill_job_when_step_done;
 	nodeset = _pick_step_nodes (job_ptr, step_specs);
 	if (nodeset == NULL)
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
-- 
GitLab