From 25227d02af5919c7f90ee626ceff2c70d2fd013d Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 20 Dec 2004 17:28:45 +0000
Subject: [PATCH] Fix how job kill RPC is handled, include node_select data if
 available, don't seg fault if data is missing, free the data when the agent
 is finished with it.

---
 src/common/node_select.c         | 18 ++++++++++--------
 src/common/slurm_protocol_defs.h |  3 ++-
 src/slurmctld/agent.c            |  3 +++
 src/slurmctld/job_mgr.c          | 19 +++++++++++++------
 src/slurmctld/node_mgr.c         |  8 ++++----
 src/slurmctld/slurmctld.h        |  2 ++
 6 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/common/node_select.c b/src/common/node_select.c
index 24380651d7e..92267a6c65f 100644
--- a/src/common/node_select.c
+++ b/src/common/node_select.c
@@ -10,7 +10,7 @@
  *****************************************************************************
  *  Copyright (C) 2002 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Morris Jette <jette@llnl.gov>.
+ *  Written by Morris Jette <jette1@llnl.gov>.
  *  UCRL-CODE-2002-040.
  *  
  *  This file is part of SLURM, a resource management program.
@@ -409,7 +409,7 @@ extern int select_g_set_jobinfo (select_jobinfo_t jobinfo,
 	char * tmp_char = (char *) data;
 
 	if (jobinfo->magic != JOBINFO_MAGIC) {
-		error("select_p_set_jobinfo: jobinfo magic bad");
+		error("select_g_set_jobinfo: jobinfo magic bad");
 		return SLURM_ERROR;
 	}
 
@@ -452,7 +452,7 @@ extern int select_g_get_jobinfo (select_jobinfo_t jobinfo,
 	char **tmp_char = (char **) data;
 
 	if (jobinfo->magic != JOBINFO_MAGIC) {
-		error("select_p_set_jobinfo: jobinfo magic bad");
+		error("select_g_get_jobinfo: jobinfo magic bad");
 		return SLURM_ERROR;
 	}
 
@@ -494,8 +494,10 @@ extern select_jobinfo_t select_g_copy_jobinfo(select_jobinfo_t jobinfo)
 {
 	struct select_jobinfo *rc = NULL;
 
-	if (jobinfo->magic != JOBINFO_MAGIC)
-		error("select_p_copy_jobinfo: jobinfo magic bad");
+	if (jobinfo == NULL)
+		;
+	else if (jobinfo->magic != JOBINFO_MAGIC)
+		error("select_g_copy_jobinfo: jobinfo magic bad");
 	else {
 		int i;
 		rc = xmalloc(sizeof(struct select_jobinfo));
@@ -523,7 +525,7 @@ extern int select_g_free_jobinfo  (select_jobinfo_t *jobinfo)
 	if (*jobinfo == NULL)	/* never set, treat as not an error */
 		;
 	else if ((*jobinfo)->magic != JOBINFO_MAGIC) {
-		error("select_p_set_jobinfo: jobinfo magic bad");
+		error("select_g_free_jobinfo: jobinfo magic bad");
 		rc = EINVAL;
 	} else {
 		(*jobinfo)->magic = 0;
@@ -595,13 +597,13 @@ extern char *select_g_sprint_jobinfo(select_jobinfo_t jobinfo,
 	int i;
 
 	if (buf == NULL) {
-		error("select_p_sprint_jobinfo: buf is null");
+		error("select_g_sprint_jobinfo: buf is null");
 		return NULL;
 	}
 
 	if ((mode != SELECT_PRINT_DATA)
 	&& jobinfo && (jobinfo->magic != JOBINFO_MAGIC)) {
-		error("select_p_copy_jobinfo: jobinfo magic bad");
+		error("select_g_sprint_jobinfo: jobinfo magic bad");
 		return NULL;
 	}
 
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 480ceb211cd..7cae18d1a95 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -319,7 +319,8 @@ typedef struct return_code_msg {
 /* Note: We include select_jobinfo here in addition to the job launch 
  * RPC in order to insure reliable clean-up of a BlueGene partition in
  * the event of some launch failure or race condition preventing slurmd 
- * from getting the BGL_PARTITION_ID at that time */
+ * from getting the BGL_PARTITION_ID at that time. It is needed for 
+ * the job epilog. */
 typedef struct kill_job_msg {
 	uint32_t job_id;
 	uint32_t job_uid;
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 7e34c9605e8..9de112fe525 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -935,6 +935,9 @@ static void _purge_agent_args(agent_arg_t *agent_arg_ptr)
 				RESPONSE_RESOURCE_ALLOCATION)
 			slurm_free_resource_allocation_response_msg(
 					agent_arg_ptr->msg_args);
+		else if (agent_arg_ptr->msg_type ==
+				REQUEST_KILL_JOB)
+			slurm_free_kill_job_msg(agent_arg_ptr->msg_args);
 		else
 			xfree(agent_arg_ptr->msg_args);
 	}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 8c6a74885ce..1ca40dd9573 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -3177,7 +3177,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 		if (job_ptr == NULL) {
 			error("Orphan job %u.%u reported on node %s",
 			      job_id_ptr[i], step_id_ptr[i], node_name);
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 		}
 
 		else if (job_ptr->job_state == JOB_RUNNING) {
@@ -3196,14 +3196,15 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 				error
 				    ("Registered job %u.%u on wrong node %s ",
 				     job_id_ptr[i], step_id_ptr[i], node_name);
-				kill_job_on_node(job_id_ptr[i], node_ptr);
+				kill_job_on_node(job_id_ptr[i], job_ptr, 
+						node_ptr);
 			}
 		}
 
 		else if (job_ptr->job_state & JOB_COMPLETING) {
 			/* Re-send kill request as needed, 
 			 * not necessarily an error */
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 		}
 
 
@@ -3215,7 +3216,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 			last_job_update    = now;
 			job_ptr->start_time = job_ptr->end_time  = now;
 			delete_job_details(job_ptr);
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 			job_completion_logger(job_ptr);
 		}
 
@@ -3225,7 +3226,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 			     job_id_ptr[i], step_id_ptr[i], 
 			     job_state_string(job_ptr->job_state),
 			     node_name);
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 		}
 	}
 
@@ -3277,10 +3278,12 @@ static void _purge_lost_batch_jobs(int node_inx, time_t now)
  *	these jobs and use this function to kill them - one 
  *	agent request per node as they register.
  * IN job_id - id of the job to be killed
+ * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
  * IN node_ptr - pointer to the node on which the job resides
  */
 extern void
-kill_job_on_node(uint32_t job_id, struct node_record *node_ptr)
+kill_job_on_node(uint32_t job_id, struct job_record *job_ptr, 
+		struct node_record *node_ptr)
 {
 	agent_arg_t *agent_info;
 	kill_job_msg_t *kill_req;
@@ -3289,6 +3292,10 @@ kill_job_on_node(uint32_t job_id, struct node_record *node_ptr)
 
 	kill_req = xmalloc(sizeof(kill_job_msg_t));
 	kill_req->job_id	= job_id;
+	if (job_ptr) {  /* NULL if unknown */
+		kill_req->select_jobinfo = select_g_copy_jobinfo(
+			job_ptr->select_jobinfo);
+	}
 
 	agent_info = xmalloc(sizeof(agent_arg_t));
 	agent_info->node_count	= 1;
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 04a31d29bdc..30e54b5b325 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1182,7 +1182,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 		if (job_ptr == NULL) {
 			error("Orphan job %u.%u reported",
 			      job_id_ptr[i], step_id_ptr[i]);
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 		}
 
 		else if (job_ptr->job_state == JOB_RUNNING) {
@@ -1197,7 +1197,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 		else if (job_ptr->job_state & JOB_COMPLETING) {
 			/* Re-send kill request as needed, 
 			 * not necessarily an error */
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 		}
 
 
@@ -1209,7 +1209,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 			last_job_update    = now;
 			job_ptr->start_time = job_ptr->end_time  = now;
 			delete_job_details(job_ptr);
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 			job_completion_logger(job_ptr);
 		}
 
@@ -1217,7 +1217,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 			error("Registered job %u.%u in state %s",
 				job_id_ptr[i], step_id_ptr[i], 
 				job_state_string(job_ptr->job_state));
-			kill_job_on_node(job_id_ptr[i], node_ptr);
+			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
 		}
 	}
 
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 9a23b1082c8..e4b3f0483b1 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -736,9 +736,11 @@ extern int kill_job_by_part_name(char *part_name);
  *	these jobs and use this function to kill them - one 
  *	agent request per node as they register.
  * IN job_id - id of the job to be killed
+ * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
  * IN node_ptr - pointer to the node on which the job resides
  */
 extern void kill_job_on_node(uint32_t job_id, 
+		struct job_record *job_ptr,
 		struct node_record *node_ptr);
 
 /*
-- 
GitLab