From e6d85f72d9e7d628e8ed2aaca3b93cb88ab884dd Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 23 Jan 2003 18:18:13 +0000
Subject: [PATCH] Job signal functions properly. All steps signalled, if
 SIGKILL the allocation is released only upon termination of the last step.

---
 src/slurmctld/controller.c |  3 ++-
 src/slurmctld/job_mgr.c    | 50 ++++++++++++++++++++++++++------------
 src/slurmctld/slurmctld.h  | 14 ++++++++---
 src/slurmctld/step_mgr.c   | 20 ++++++++++-----
 4 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 5a87e746ebe..d601ffb95c9 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -940,7 +940,8 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg)
 
 	/* do RPC call */
 	if (job_step_kill_msg->job_step_id == NO_VAL) {
-		error_code = job_cancel(job_step_kill_msg->job_id, uid);
+		error_code = job_signal(job_step_kill_msg->job_id, 
+					job_step_kill_msg->signal, uid);
 		unlock_slurmctld(job_write_lock);
 
 		/* return result */
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index b105322aa10..a9635115b27 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1101,20 +1101,21 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
 
 
 /* 
- * job_cancel - cancel the specified job
- * IN job_id - id of the job to be cancelled
+ * job_signal - signal the specified job
+ * IN job_id - id of the job to be signaled
+ * IN signal - signal to send, SIGKILL == cancel the job
  * IN uid - uid of requesting user
  * RET 0 on success, otherwise ESLURM error code 
  * global: job_list - pointer global job list
  *	last_job_update - time of last job table update
  */
-int job_cancel(uint32_t job_id, uid_t uid)
+int job_signal(uint32_t job_id, uint16_t signal, uid_t uid)
 {
 	struct job_record *job_ptr;
 
 	job_ptr = find_job_record(job_id);
 	if (job_ptr == NULL) {
-		info("job_cancel: invalid job id %u", job_id);
+		info("job_signal: invalid job id %u", job_id);
 		return ESLURM_INVALID_JOB_ID;
 	}
 
@@ -1129,28 +1130,47 @@ int job_cancel(uint32_t job_id, uid_t uid)
 		return ESLURM_USER_ID_MISSING;
 	}
 
-	if (job_ptr->job_state == JOB_PENDING) {
+	if ((job_ptr->job_state == JOB_PENDING) &&
+	    (signal == SIGKILL)) {
 		last_job_update = time(NULL);
 		job_ptr->job_state = JOB_FAILED;
 		job_ptr->start_time = job_ptr->end_time = time(NULL);
 		delete_job_details(job_ptr);
-		verbose("job_cancel of pending job %u successful", job_id);
+		verbose("job_signal of pending job %u successful", job_id);
 		return SLURM_SUCCESS;
 	}
 
 	if (job_ptr->job_state == JOB_RUNNING) {
-		last_job_update = time(NULL);
-		job_ptr->job_state = JOB_FAILED;
-		job_ptr->end_time = time(NULL);
-		deallocate_nodes(job_ptr);
-		delete_all_step_records(job_ptr);
-		delete_job_details(job_ptr);
-		verbose("job_cancel of running job %u successful", job_id);
+		ListIterator step_record_iterator;
+		struct step_record *step_ptr;
+		int step_cnt = 0;
+
+		step_record_iterator = 
+				list_iterator_create (job_ptr->step_list);		
+		while ((step_ptr = (struct step_record *)
+					list_next (step_record_iterator))) {
+			signal_step_tasks(step_ptr, signal);
+			step_cnt++;
+		}
+		list_iterator_destroy (step_record_iterator);
+
+		if (signal == SIGKILL) {
+			job_ptr->kill_on_step_done = 1;
+			last_job_update = time(NULL);
+		}
+		if ((signal == SIGKILL) && (step_cnt == 0)) {
+			/* kill job with no active steps */
+			job_ptr->job_state = JOB_COMPLETE;
+			job_ptr->end_time = time(NULL);
+			deallocate_nodes(job_ptr);
+			delete_job_details(job_ptr);
+		}
+		verbose("job_signal of running job %u successful", job_id);
 		return SLURM_SUCCESS;
 	}
 
-	verbose("job_cancel: job %u can't be cancelled from state=%s",
-		job_id, job_state_string(job_ptr->job_state));
+	verbose("job_signal: job %u can't be sent signal %u from state=%s",
+		job_id, signal, job_state_string(job_ptr->job_state));
 	return ESLURM_TRANSITION_STATE_NO_UPDATE;
 }
 
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 3e190b2af36..27b78407c64 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -580,14 +580,15 @@ extern int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
 	     uint16_t * node_cnt, slurm_addr ** node_addr);
 
 /* 
- * job_cancel - cancel the specified job
- * IN job_id - id of the job to be cancelled
+ * job_signal - signal the specified job
+ * IN job_id - id of the job to be signaled
+ * IN signal - signal to send, SIGKILL == cancel the job
  * IN uid - uid of requesting user
  * RET 0 on success, otherwise ESLURM error code 
  * global: job_list - pointer global job list
  *	last_job_update - time of last job table update
  */
-extern int job_cancel (uint32_t job_id, uid_t uid);
+extern int job_signal(uint32_t job_id, uint16_t signal, uid_t uid);
 
 /* 
  * job_step_cancel - cancel the specified job step
@@ -886,6 +887,13 @@ extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid);
  *	Uses common data structures. */
 extern void set_slurmd_addr (void);
 
+/*
+ * signal_step_tasks - send specific signal to specific job step
+ * IN step_ptr - step record pointer
+ * IN signal - signal to send
+ */
+extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal);
+
 /*
  * step_create - creates a step_record in step_specs->job_id, sets up the
  *	accoding to the step_specs.
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 6d94444814d..c722f26f5cb 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -51,8 +51,6 @@
 static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer);
 static bitstr_t * _pick_step_nodes (struct job_record  *job_ptr, 
 				    step_specs *step_spec );
-static void _signal_step_tasks(struct step_record *step_ptr, uint16_t signal);
-
 /* 
  * create_step_record - create an empty step_record for the specified job.
  * IN job_ptr - pointer to job table entry to have step record added
@@ -235,12 +233,17 @@ int job_step_signal(uint32_t job_id, uint32_t step_id,
 		return ESLURM_ALREADY_DONE;
 	}
 
-	_signal_step_tasks(step_ptr, signal);
+	signal_step_tasks(step_ptr, signal);
 	return SLURM_SUCCESS;
 
 }
 
-static void _signal_step_tasks(struct step_record *step_ptr, uint16_t signal)
+/*
+ * signal_step_tasks - send specific signal to specific job step
+ * IN step_ptr - step record pointer
+ * IN signal - signal to send
+ */
+void signal_step_tasks(struct step_record *step_ptr, uint16_t signal)
 {
 	int i;
 	kill_tasks_msg_t *kill_tasks_msg;
@@ -329,7 +332,8 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid,
 		return ESLURM_INVALID_JOB_ID;
 	}
 
-	if (job_ptr->kill_on_step_done)
+	if ((job_ptr->kill_on_step_done) &&
+	    (list_count(job_ptr->step_list) <= 1))
 		return job_complete(job_id, uid, requeue, job_return_code);
 
 	if ((job_ptr->job_state == JOB_FAILED) ||
@@ -517,8 +521,12 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record,
 		return ESLURM_BAD_DIST;
 #endif
 
-	job_ptr->time_last_active = time(NULL);
+	if (job_ptr->kill_on_step_done)
+		/* Don't start more steps, job already being cancelled */
+		return ESLURM_ALREADY_DONE;
 	job_ptr->kill_on_step_done = kill_job_when_step_done;
+
+	job_ptr->time_last_active = time(NULL);
 	nodeset = _pick_step_nodes (job_ptr, step_specs);
 	if (nodeset == NULL)
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
-- 
GitLab