From fc65aaf3f36221fd4616716665b26e274aaabd1f Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 23 Jan 2003 22:54:11 +0000
Subject: [PATCH] Make nodes DOWN and log event on slurmd prolog/epilog
 failure.
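
Non-responding nodes now get their own DSH_NO_RESP state in the agent;
DSH_FAILED is reused for nodes that did answer but reported a prolog or
epilog failure, and the watchdog calls the new set_node_down() on them,
which marks the node DOWN and kills its running jobs.

A rough sketch (not part of the patch) of the intended return-code
handling: classify_reply() is a hypothetical helper for illustration,
the real logic is inline in _thread_per_node_rpc(), and the ESLURMD_*
values below are placeholders standing in for the real slurmd error
codes:

	typedef enum { DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_NO_RESP,
		       DSH_FAILED } state_t;

	/* placeholder values; the real slurmd error codes are used in the patch */
	enum { ESLURMD_PROLOG_FAILED = 4001, ESLURMD_EPILOG_FAILED = 4002 };

	/* Map a slurmd reply code to the agent's per-thread state.
	 * DSH_FAILED means "node answered, but prolog/epilog failed":
	 * the watchdog will set_node_down() it.  A node that never
	 * answers is DSH_NO_RESP and keeps the old not-responding /
	 * retry behavior. */
	static state_t classify_reply(int rc)
	{
		if (rc == 0)
			return DSH_DONE;
		if (rc == ESLURMD_PROLOG_FAILED || rc == ESLURMD_EPILOG_FAILED)
			return DSH_FAILED;
		return DSH_DONE;	/* other errors are logged only */
	}

	int main(void)
	{
		return classify_reply(ESLURMD_EPILOG_FAILED) == DSH_FAILED ? 0 : 1;
	}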

---
 src/slurmctld/agent.c          | 56 ++++++++++++++++++++++------------
 src/slurmctld/node_mgr.c       | 26 ++++++++++++++++
 src/slurmctld/node_scheduler.c |  4 ++-
 src/slurmctld/slurmctld.h      |  4 +++
 4 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 7e0707ed85e..2829f20ef75 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -73,7 +73,8 @@
 #  define WDOG_POLL 		2	/* secs */
 #endif
 
-typedef enum { DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED } state_t;
+typedef enum { DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_NO_RESP, 
+	       DSH_FAILED } state_t;
 
 typedef struct thd {
 	pthread_t thread;		/* thread ID */
@@ -296,7 +297,8 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
  */
 static void *_wdog(void *args)
 {
-	int i, fail_cnt, work_done, delay, max_delay = 0;
+	int fail_cnt, no_resp_cnt, work_done;
+	int i, delay, max_delay = 0;
 	agent_info_t *agent_ptr = (agent_info_t *) args;
 	thd_t *thread_ptr = agent_ptr->thread_struct;
 #if AGENT_IS_THREAD
@@ -309,8 +311,10 @@ static void *_wdog(void *args)
 #endif
 
 	while (1) {
-		work_done = 1;	/* assume all threads complete for now */
-		fail_cnt = 0;	/* assume all threads complete sucessfully for now */
+		work_done   = 1;	/* assume all threads complete */
+		fail_cnt    = 0;	/* assume no thread failures */
+		no_resp_cnt = 0;	/* assume all threads respond */
+
 		sleep(WDOG_POLL);
 
 		slurm_mutex_lock(&agent_ptr->thread_mutex);
@@ -332,6 +336,9 @@ static void *_wdog(void *args)
 					max_delay =
 					    (int) thread_ptr[i].time;
 				break;
+			case DSH_NO_RESP:
+				no_resp_cnt++;
+				break;
 			case DSH_FAILED:
 				fail_cnt++;
 				break;
@@ -343,12 +350,12 @@ static void *_wdog(void *args)
 	}
 
 	/* Notify slurmctld of non-responding nodes */
-	if (fail_cnt) {
+	if (no_resp_cnt) {
 #if AGENT_IS_THREAD
 		/* Update node table data for non-responding nodes */
 		lock_slurmctld(node_write_lock);
 		for (i = 0; i < agent_ptr->thread_count; i++) {
-			if (thread_ptr[i].state == DSH_FAILED)
+			if (thread_ptr[i].state == DSH_NO_RESP)
 				node_not_resp(thread_ptr[i].node_name);
 		}
 		unlock_slurmctld(node_write_lock);
@@ -358,7 +365,7 @@ static void *_wdog(void *args)
 		slurm_names = xmalloc(fail_cnt * MAX_NAME_LEN);
 		fail_cnt = 0;
 		for (i = 0; i < agent_ptr->thread_count; i++) {
-			if (thread_ptr[i].state == DSH_FAILED) {
+			if (thread_ptr[i].state == DSH_NO_RESP) {
 				strncpy(&slurm_names
 					[MAX_NAME_LEN * fail_cnt],
 					thread_ptr[i].node_name,
@@ -383,6 +390,8 @@ static void *_wdog(void *args)
 	/* Update last_response on responding nodes */
 	lock_slurmctld(node_write_lock);
 	for (i = 0; i < agent_ptr->thread_count; i++) {
+		if (thread_ptr[i].state == DSH_FAILED)
+			set_node_down(thread_ptr[i].node_name);
 		if (thread_ptr[i].state == DSH_DONE)
 			node_did_resp(thread_ptr[i].node_name);
 	}
@@ -390,16 +399,17 @@ static void *_wdog(void *args)
 #else
 	/* Build a list of all responding nodes and send it to slurmctld to 
 	 * update time stamps */
-	done_cnt = agent_ptr->thread_count - fail_cnt;
+	done_cnt = agent_ptr->thread_count - fail_cnt - no_resp_cnt;
 	slurm_names = xmalloc(done_cnt * MAX_NAME_LEN);
 	done_cnt = 0;
 	for (i = 0; i < agent_ptr->thread_count; i++) {
-		if (thread_ptr[i].state == DSH_DONE) {
+		if (thread_ptr[i].state == DSH_DONE) {
 			strncpy(&slurm_names[MAX_NAME_LEN * done_cnt],
 				thread_ptr[i].node_name, MAX_NAME_LEN);
 			done_cnt++;
 		}
 	}
+	/* FIXME: DSH_FAILED nodes also need set_node_down() handling here */
 
 	/* send RPC */
 	fatal("Code development needed here if agent is not thread");
@@ -428,7 +438,7 @@ static void *_thread_per_node_rpc(void *args)
 	return_code_msg_t *slurm_rc_msg;
 	task_info_t *task_ptr = (task_info_t *) args;
 	thd_t *thread_ptr = task_ptr->thread_struct_ptr;
-	state_t thread_state = DSH_FAILED;
+	state_t thread_state = DSH_NO_RESP;
 	sigset_t set;
 
 	/* set up SIGALRM handler */
@@ -494,19 +504,27 @@ static void *_thread_per_node_rpc(void *args)
 		slurm_rc_msg = (return_code_msg_t *) response_msg->data;
 		rc = slurm_rc_msg->return_code;
 		slurm_free_return_code_msg(slurm_rc_msg);
-		if (rc)
-			error("_thread_per_node_rpc/rc error from host %s: %s",
+		if (rc == 0) {
+			debug3("agent processed RPC to node %s",
+			       thread_ptr->node_name);
+			thread_state = DSH_DONE;
+		} else if (rc == ESLURMD_EPILOG_FAILED) {
+			error("Epilog failure on host %s, setting DOWN",
+			      thread_ptr->node_name);
+			thread_state = DSH_FAILED;
+		} else if (rc == ESLURMD_PROLOG_FAILED) {
+			error("Prolog failure on host %s, setting DOWN",
+			      thread_ptr->node_name);
+			thread_state = DSH_FAILED;
+		} else {
+			error("agent error from host %s: %s",
 			      thread_ptr->node_name,
 			      slurm_strerror(rc));	/* Don't use %m */
-		else {
-			debug3
-			    ("agent sucessfully processed RPC to node %s",
-			     thread_ptr->node_name);
+			thread_state = DSH_DONE;
 		}
-		thread_state = DSH_DONE;
 		break;
 	default:
-		error("_thread_per_node_rpc from host %s, bad msg_type %d",
+		error("agent reply from host %s, bad msg_type %d",
 		      thread_ptr->node_name, response_msg->msg_type);
 		break;
 	}
@@ -578,7 +596,7 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count)
 
 	j = 0;
 	for (i = 0; i < agent_info_ptr->thread_count; i++) {
-		if (thread_ptr[i].state != DSH_FAILED)
+		if (thread_ptr[i].state != DSH_NO_RESP)
 			continue;
 		agent_arg_ptr->slurm_addr[j] = thread_ptr[i].slurm_addr;
 		strncpy(&agent_arg_ptr->node_names[j * MAX_NAME_LEN],
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 57a2765405b..14fa07b5bc7 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1194,6 +1194,32 @@ void node_not_resp (char *name)
 	return;
 }
 
+/* set_node_down - make the specified node's state DOWN, kill jobs as needed 
+ * IN name - name of the node */
+void set_node_down (char *name)
+{
+	struct node_record *node_ptr;
+	int node_inx;
+	uint16_t resp_state;
+
+	node_ptr = find_node_record (name);
+	if (node_ptr == NULL) {
+		error ("set_node_down unable to find node %s", name);
+		return;
+	}
+
+	node_inx = node_ptr - node_record_table_ptr;
+	last_node_update = time (NULL);
+	/* preserve NODE_STATE_NO_RESPOND flag if set */
+	resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
+	node_ptr->node_state = NODE_STATE_DOWN | resp_state;
+	bit_clear (up_node_bitmap,   node_inx);
+	bit_clear (idle_node_bitmap, node_inx);
+	(void) kill_running_job_by_node_name(name, false);
+
+	return;
+}
+
 /* ping_nodes - check that all nodes and daemons are alive,  
  *	get nodes in UNKNOWN state to register */
 void ping_nodes (void)
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index cc01d77552e..fdd3ae0fc60 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -211,7 +211,9 @@ void make_node_idle(struct node_record *node_ptr)
 
 	base_state   = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
 	no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
-	if (base_state == NODE_STATE_DRAINING) {
+	if (base_state == NODE_STATE_DOWN) {
+		debug3("Node %s being left DOWN", node_ptr->name);
+	} else if (base_state == NODE_STATE_DRAINING) {
 		node_ptr->node_state = NODE_STATE_DRAINED;
 		bit_clear(idle_node_bitmap, inx);
 		bit_clear(up_node_bitmap, inx);
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 27b78407c64..3f985ba7ee8 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -883,6 +883,10 @@ extern int select_nodes (struct job_record *job_ptr, bool test_only);
  */
 extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid);
 
+/* set_node_down - make the specified node's state DOWN, kill jobs as needed
+ * IN name - name of the node */
+extern void set_node_down (char *name);
+
 /* set_slurmd_addr - establish the slurm_addr for the slurmd on each node
  *	Uses common data structures. */
 extern void set_slurmd_addr (void);
-- 
GitLab