From 5d9fa3f6515d67d6ee9755f3a21f5aae97377358 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 3 Oct 2003 20:59:14 +0000
Subject: [PATCH] Resend KILL_JOB message if a completing job hangs. This can
 recover from the loss of an EPILOG_COMPLETE message.

---
 src/slurmctld/job_mgr.c        | 12 +++--
 src/slurmctld/node_scheduler.c | 93 +++++++++++++++++++++++++++++++---
 src/slurmctld/slurmctld.h      | 13 ++++-
 3 files changed, 104 insertions(+), 14 deletions(-)

diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index e175a7cf47a..faf88e2374b 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -2159,11 +2159,15 @@ static int _list_find_job_old(void *job_entry, void *key)
 	if (job_ptr->end_time > min_age)
 		return 0;	/* Too new to purge */
 
-	if ((!(IS_JOB_FINISHED(job_ptr))) ||
-	    (job_ptr->job_state & JOB_COMPLETING))
-		return 0;	/* Still active, can't purge */
+	if (!(IS_JOB_FINISHED(job_ptr))) 
+		return 0;	/* Job still active */
 
-	return 1;
+	if (job_ptr->job_state & JOB_COMPLETING) {
+		re_kill_job(job_ptr);
+		return 0;	/* Job still completing */
+	}
+
+	return 1;		/* Purge the job */
 }
 
 
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 820f4ec0b4e..c40e03437ca 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -159,12 +159,11 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout)
 	kill_job->job_uid = job_ptr->user_id;
 
 	for (i = 0; i < node_record_count; i++) {
+		struct node_record *node_ptr = &node_record_table_ptr[i];
 		if (bit_test(job_ptr->node_bitmap, i) == 0)
 			continue;
-		base_state = node_record_table_ptr[i].node_state &
-				(~NODE_STATE_NO_RESPOND);
-		no_resp_flag = node_record_table_ptr[i].node_state &
-				NODE_STATE_NO_RESPOND;
+		base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
+		no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
 		if ((base_state == NODE_STATE_DOWN) || no_resp_flag) {
 			/* Issue the KILL RPC, but don't verify response */
 			down_node_cnt++;
@@ -180,12 +179,12 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout)
 				 (MAX_NAME_LEN * buf_rec_size));
 		}
 		agent_args->slurm_addr[agent_args->node_count] =
-		    node_record_table_ptr[i].slurm_addr;
+		    node_ptr->slurm_addr;
 		strncpy(&agent_args->
 			node_names[MAX_NAME_LEN * agent_args->node_count],
-			node_record_table_ptr[i].name, MAX_NAME_LEN);
+			node_ptr->name, MAX_NAME_LEN);
 		agent_args->node_count++;
-		make_node_comp(&node_record_table_ptr[i]);
+		make_node_comp(node_ptr);
 	}
 
 	if ((agent_args->node_count - down_node_cnt) == 0)
@@ -193,6 +192,7 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout)
 	if (agent_args->node_count == 0) {
 		error("Job %u allocated no nodes to be killed on",
 		      job_ptr->job_id);
+		xfree(kill_job);
 		xfree(agent_args);
 		return;
 	}
@@ -218,7 +218,6 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout)
 	return;
 }
 
-
 /*
  * _match_feature - determine if the desired feature is one of those available
  * IN seek - desired feature
@@ -1182,3 +1181,81 @@ static int _valid_features(char *requested, char *available)
 	xfree(tmp_requested);
 	return result;
 }
+
+/*
+ * re_kill_job - for a given job, deallocate its nodes for a second time, 
+ *	basically a cleanup for failed deallocate_nodes() calls
+ * IN job_ptr - pointer to terminating job (already in some COMPLETING state)
+ * globals: node_record_count - number of nodes in the system
+ *	node_record_table_ptr - pointer to global node table
+ */
+extern void re_kill_job(struct job_record *job_ptr)
+{
+	int i, retries = 0;
+	kill_job_msg_t *kill_job;
+	agent_arg_t *agent_args;
+	pthread_attr_t attr_agent;
+	pthread_t thread_agent;
+	int buf_rec_size = 0;
+
+	xassert(job_ptr);
+	xassert(job_ptr->details);
+
+	agent_args = xmalloc(sizeof(agent_arg_t));
+	agent_args->msg_type = REQUEST_KILL_JOB;
+	agent_args->retry = 0;
+	kill_job = xmalloc(sizeof(kill_job_msg_t));
+	last_node_update = time(NULL);
+	kill_job->job_id = job_ptr->job_id;
+	kill_job->job_uid = job_ptr->user_id;
+
+	for (i = 0; i < node_record_count; i++) {
+		struct node_record *node_ptr = &node_record_table_ptr[i];
+		if (bit_test(job_ptr->node_bitmap, i) == 0)
+			continue;
+		if (node_ptr->node_state & NODE_STATE_NO_RESPOND)
+			continue;
+		info("Resending KILL_JOB request for JobId=%u, Node=%s",
+			job_ptr->job_id, node_ptr->name);
+		if ((agent_args->node_count + 1) > buf_rec_size) {
+			buf_rec_size += 32;
+			xrealloc((agent_args->slurm_addr),
+				 (sizeof(struct sockaddr_in) *
+				  buf_rec_size));
+			xrealloc((agent_args->node_names),
+				 (MAX_NAME_LEN * buf_rec_size));
+		}
+		agent_args->slurm_addr[agent_args->node_count] =
+		    node_ptr->slurm_addr;
+		strncpy(&agent_args->
+			node_names[MAX_NAME_LEN * agent_args->node_count],
+			node_ptr->name, MAX_NAME_LEN);
+		agent_args->node_count++;
+	}
+
+	if (agent_args->node_count == 0) {
+		xfree(kill_job);
+		xfree(agent_args);
+		return;
+	}
+
+	agent_args->msg_args = kill_job;
+	debug2("Spawning job kill agent");
+	if (pthread_attr_init(&attr_agent))
+		fatal("pthread_attr_init error %m");
+	if (pthread_attr_setdetachstate
+	    (&attr_agent, PTHREAD_CREATE_DETACHED))
+		error("pthread_attr_setdetachstate error %m");
+#ifdef PTHREAD_SCOPE_SYSTEM
+	if (pthread_attr_setscope(&attr_agent, PTHREAD_SCOPE_SYSTEM))
+		error("pthread_attr_setscope error %m");
+#endif
+	while (pthread_create(&thread_agent, &attr_agent, agent, 
+			(void *) agent_args)) {
+		error("pthread_create error %m");
+		if (++retries > MAX_RETRIES)
+			fatal("Can't create pthread");
+		sleep(1);	/* sleep and try again */
+	}
+	return;
+}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 39f2024e0a4..62a6f14ee48 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -888,11 +888,20 @@ void part_fini (void);
  * global: job_list - global job table
  *	last_job_update - time of last job table update
  */
-void purge_old_job (void);
+extern void purge_old_job (void);
+
+/*
+ * re_kill_job - for a given job, deallocate its nodes for a second time, 
+ *      basically a cleanup for failed deallocate_nodes() calls
+ * IN job_ptr - pointer to terminating job (already in some COMPLETING state)
+ * globals: node_record_count - number of nodes in the system
+ *      node_record_table_ptr - pointer to global node table
+ */
+extern void re_kill_job(struct job_record *job_ptr);
 
 /* rehash_jobs - Create or rebuild the job rehash table. Actually for now we 
  * just preserve it */
-void rehash_jobs(void);
+extern void rehash_jobs(void);
 
 /* 
  * rehash_node - build a hash table of the node_record entries. this is a large 
-- 
GitLab