From f828b775dcffb59cd79d80af9d66be405909079b Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Sat, 28 Dec 2002 00:41:12 +0000
Subject: [PATCH] If a slurmd fails, let job allocated to it continue execution
 if so configured.

---
 src/slurmctld/job_mgr.c        | 28 ++++++++++++++++++++++++++--
 src/slurmctld/node_scheduler.c |  4 ++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 3d6f1589511..6693441e6eb 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -114,6 +114,8 @@ static void _dump_job_details_state(struct job_details *detail_ptr,
 				    Buf buffer);
 static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer);
 static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer);
+static void _excise_node_from_job(struct job_record *job_record_ptr, 
+				  struct node_record *node_record_ptr);
 static int  _find_batch_dir(void *x, void *key);
 static void _get_batch_job_dir_ids(List batch_dirs);
 static int  _job_create(job_desc_msg_t * job_specs, uint32_t * new_job_id,
@@ -903,20 +905,42 @@ int kill_running_job_by_node_name(char *node_name)
 		      job_record_point->job_id, node_name);
 		job_count++;
 		if ((job_record_point->details == NULL) ||
-		    (job_record_point->kill_on_node_fail)) {
-			last_job_update = time(NULL);
+		    (job_record_point->kill_on_node_fail) ||
+		    (job_record_point->node_cnt <= 1)) {
 			job_record_point->job_state = JOB_NODE_FAIL;
 			job_record_point->end_time = time(NULL);
 			deallocate_nodes(job_record_point);
 			delete_job_details(job_record_point);
+		} else {
+			/* Remove node from this job's list */
+			_excise_node_from_job(job_record_point, 
+					      node_record_point);
 		}
 
 	}
 	list_iterator_destroy(job_record_iterator);
+	if (job_count)
+		last_job_update = time(NULL);
 
 	return job_count;
 }
 
+/* Remove one node from a job's allocation and rebuild the job's node list,
+ * count and per-node details so they all agree with the updated bitmap */
+static void _excise_node_from_job(struct job_record *job_record_ptr,
+				  struct node_record *node_record_ptr)
+{
+	int bit_position = node_record_ptr - node_record_table_ptr;
+
+	bit_clear(job_record_ptr->node_bitmap, bit_position);
+	job_record_ptr->node_cnt--;	/* else build_node_details logs a mismatch */
+	FREE_NULL(job_record_ptr->nodes);	/* release old list before replacing */
+	job_record_ptr->nodes = bitmap2node_name(job_record_ptr->node_bitmap);
+	FREE_NULL(job_record_ptr->cpus_per_node);
+	FREE_NULL(job_record_ptr->cpu_count_reps);
+	FREE_NULL(job_record_ptr->node_addr);
+	build_node_details(job_record_ptr);	/* rebuilds arrays from node_bitmap */
+}
 
 
 /*
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 0cb826dbcb5..9d7e0047c5f 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -950,8 +950,8 @@ void build_node_details(struct job_record *job_ptr)
 	}
 	hostlist_destroy(host_list);
 	if (job_ptr->node_cnt != node_inx) {
-		error("Node count mismatch for job_id %u",
-		      job_ptr->job_id);
+		error("Node count mismatch for job_id %u (%u,%u)",
+		      job_ptr->job_id, job_ptr->node_cnt, node_inx);
 		job_ptr->node_cnt = node_inx;
 	}
 	job_ptr->num_cpu_groups = cpu_inx + 1;
-- 
GitLab