From 6f1cb823c1a1103bf6cbc52adf8a3d051f774f0b Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 23 Sep 2002 21:35:54 +0000
Subject: [PATCH] Added support for ping RPC. slurmctld will now periodically
 ping slurmd and flag non-responsive nodes as such.

---
 src/slurmctld/agent.c      |  7 ++--
 src/slurmctld/controller.c | 50 ++++++++++++++++++++++---
 src/slurmctld/node_mgr.c   | 77 ++++++++++++++++++++++++++++++++++++++
 src/slurmctld/slurmctld.h  |  7 ++++
 4 files changed, 133 insertions(+), 8 deletions(-)

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 1076e657204..73da1829d47 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -131,7 +131,8 @@ agent (void *args)
 		fatal ("agent passed NULL address list");
 	if (agent_arg_ptr->node_names == NULL)
 		fatal ("agent passed NULL node name list");
-	if (agent_arg_ptr->msg_type != REQUEST_REVOKE_JOB_CREDENTIAL)
+	if ((agent_arg_ptr->msg_type != REQUEST_REVOKE_JOB_CREDENTIAL) &&
+	    (agent_arg_ptr->msg_type != REQUEST_PING))
 		fatal ("agent passed invalid message type %d", agent_arg_ptr->msg_type);
 
 	/* initialize the data structures */
@@ -252,8 +253,8 @@ wdog (void *args)
 	agent_info_t *agent_ptr = (agent_info_t *) args;
 	thd_t *thread_ptr = agent_ptr->thread_struct;
 #if AGENT_IS_THREAD
-	/* Locks: Write node */
-	slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
+	/* Locks: Write job and write node */
+	slurmctld_lock_t node_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 #else
 	int done_cnt;
 	char *slurm_names;
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 92204b369cb..075970c1fb3 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -98,6 +98,7 @@ inline static void slurm_rpc_job_step_create( slurm_msg_t* msg ) ;
 inline static void slurm_rpc_job_step_get_info ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_job_will_run ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_node_registration ( slurm_msg_t * msg ) ;
+inline static void slurm_rpc_ping ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_shutdown_controller ( slurm_msg_t * msg );
 inline static void slurm_rpc_shutdown_controller_immediate ( slurm_msg_t * msg );
@@ -378,15 +379,19 @@ slurmctld_background ( void * no_data )
 {
 	static time_t last_sched_time;
 	static time_t last_checkpoint_time;
+	static time_t last_ping_time;
 	static time_t last_timelimit_time;
 	time_t now;
 	/* Locks: Write job, write node, read partition */
 	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
+	/* Locks: Write job, write node */
+	slurmctld_lock_t node_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 	/* Locks: Write partition */
 	slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK };
 
 	/* Let the dust settle before doing work */
-	last_sched_time = last_checkpoint_time = last_timelimit_time = time (NULL);
+	last_sched_time = last_checkpoint_time = last_timelimit_time = 
+		last_ping_time = time (NULL);
 	(void) pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
 	(void) pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
 	debug3 ("slurmctld_background pid = %u", getpid ());
@@ -396,7 +401,7 @@ slurmctld_background ( void * no_data )
 
 		now = time (NULL);
 
-		if ((now - last_timelimit_time) > PERIODIC_TIMEOUT) {
+		if (difftime (now, last_timelimit_time) > PERIODIC_TIMEOUT) {
 			last_timelimit_time = now;
 			debug ("Performing job time limit check");
 			lock_slurmctld (job_write_lock);
@@ -404,14 +409,22 @@ slurmctld_background ( void * no_data )
 			unlock_slurmctld (job_write_lock);
 		}
 
-		if ((now - last_timelimit_time) > PERIODIC_GROUP_CHECK) {
+		if (difftime (now, last_ping_time) >= slurmctld_conf.heartbeat_interval) {
+			last_ping_time = now;
+			debug ("Performing node ping");
+			lock_slurmctld (node_write_lock);
+			ping_nodes ();
+			unlock_slurmctld (node_write_lock);
+		}
+
+		if (difftime (now, last_timelimit_time) >= PERIODIC_GROUP_CHECK) {
 			last_timelimit_time = now;
 			lock_slurmctld (part_write_lock);
 			load_part_uid_allow_list ( 0 );
 			unlock_slurmctld (part_write_lock);
 		}
 
-		if ((now - last_sched_time) > PERIODIC_SCHEDULE) {
+		if (difftime (now, last_sched_time) >= PERIODIC_SCHEDULE) {
 			last_sched_time = now;
 			debug ("Performing purge of old job records");
 			lock_slurmctld (job_write_lock);
@@ -421,7 +434,8 @@ slurmctld_background ( void * no_data )
 				last_checkpoint_time = 0;	/* force state save */
 		}
 
-		if (shutdown_time || (now - last_checkpoint_time) > PERIODIC_CHECKPOINT) {
+		if (shutdown_time || 
+		    (difftime (now, last_checkpoint_time) >= PERIODIC_CHECKPOINT) ) {
 			if (shutdown_time) {	
 				/* wait for any RPC's to complete */
 				if (server_thread_count)
@@ -578,6 +592,9 @@ slurmctld_req ( slurm_msg_t * msg )
 		case REQUEST_SHUTDOWN_IMMEDIATE:
 			slurm_rpc_shutdown_controller_immediate ( msg ) ;
 			break;
+		case REQUEST_PING:
+			slurm_rpc_ping ( msg ) ;
+			break;
 		case REQUEST_UPDATE_JOB:
 			slurm_rpc_update_job ( msg ) ;
 			slurm_free_job_desc_msg ( msg -> data ) ;
@@ -1376,6 +1393,29 @@ void slurm_rpc_job_will_run ( slurm_msg_t * msg )
 	}
 }
 
+/* slurm_rpc_ping - process ping RPC */
+void 
+slurm_rpc_ping ( slurm_msg_t * msg )
+{
+	/* init */
+	int error_code = SLURM_SUCCESS;
+#ifdef HAVE_AUTHD
+	uid_t uid = 0;
+#endif
+
+#ifdef	HAVE_AUTHD
+	uid = slurm_auth_uid (msg->cred);
+	if ( (uid != 0) && (uid != getuid ()) ) {
+		error ("Security violation, PING RPC from uid %u", (unsigned int) uid);
+		error_code = ESLURM_USER_ID_MISSING;
+	}
+#endif
+
+	/* return result */
+	slurm_send_rc_msg ( msg , error_code );
+}
+
+
 /* slurm_rpc_reconfigure_controller - process RPC to re-initialize slurmctld from configuration file */
 void 
 slurm_rpc_reconfigure_controller ( slurm_msg_t * msg )
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index a354ac3fc4a..582e8314ec0 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -35,6 +35,7 @@
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
+#include <time.h>
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -42,6 +43,7 @@
 
 #include <src/common/hostlist.h>
 #include <src/common/xstring.h>
+#include <src/slurmctld/agent.h>
 #include <src/slurmctld/locks.h>
 #include <src/slurmctld/slurmctld.h>
 
@@ -1089,6 +1091,13 @@ node_did_resp (char *name)
 	i = node_ptr - node_record_table_ptr;
 	last_node_update = time (NULL);
 	node_record_table_ptr[i].last_response = time (NULL);
+	node_ptr->node_state &= (uint16_t) (~NODE_STATE_NO_RESPOND);
+	if (node_ptr->node_state == NODE_STATE_UNKNOWN)
+		node_ptr->node_state = NODE_STATE_IDLE;
+	if (node_ptr->node_state == NODE_STATE_IDLE)
+		bit_set (idle_node_bitmap, (node_ptr - node_record_table_ptr));
+	if (node_ptr->node_state != NODE_STATE_DOWN)
+		bit_set (up_node_bitmap, (node_ptr - node_record_table_ptr));
 	return;
 }
 
@@ -1106,11 +1115,79 @@ node_not_resp (char *name)
 	}
 
 	i = node_ptr - node_record_table_ptr;
+	if (node_record_table_ptr[i].node_state & NODE_STATE_NO_RESPOND)
+		return;		/* Already known to be not responding */
+
 	last_node_update = time (NULL);
 	error ("Node %s not responding", name);
 	bit_clear (up_node_bitmap, i);
 	bit_clear (idle_node_bitmap, i);
 	node_record_table_ptr[i].node_state |= NODE_STATE_NO_RESPOND;
+	kill_running_job_by_node_name (node_record_table_ptr[i].name);
 	return;
 }
 
+/* ping_nodes - check that all nodes and daemons are alive */
+void 
+ping_nodes (void)
+{
+	int i, age;
+	int buf_rec_size = 0;
+	time_t now;
+	agent_arg_t *agent_args;
+	pthread_attr_t attr_agent;
+	pthread_t thread_agent;
+
+	agent_args = xmalloc (sizeof (agent_arg_t));
+	agent_args->msg_type = REQUEST_PING;
+	now = time(NULL);
+	for (i = 0; i < node_record_count; i++) {
+		if (node_record_table_ptr[i].node_state == NODE_STATE_DOWN)
+			continue;
+
+		age = difftime (now, node_record_table_ptr[i].last_response);
+		if (age < slurmctld_conf.heartbeat_interval)
+			continue;
+
+		debug3 ("ping %s now", node_record_table_ptr[i].name);
+		if ((agent_args->addr_count+1) > buf_rec_size) {
+			buf_rec_size += 32;
+			xrealloc ((agent_args->slurm_addr), (sizeof (struct sockaddr_in) * buf_rec_size));
+			xrealloc ((agent_args->node_names), (MAX_NAME_LEN * buf_rec_size));
+		}
+		agent_args->slurm_addr[agent_args->addr_count] = node_record_table_ptr[i].slurm_addr;
+		strncpy (&agent_args->node_names[MAX_NAME_LEN*agent_args->addr_count],
+		         node_record_table_ptr[i].name, MAX_NAME_LEN);
+		agent_args->addr_count++;
+
+		if (age >= slurmctld_conf.slurmd_timeout) {
+			error ("node %s not responding", node_record_table_ptr[i].name);
+			last_node_update = time (NULL);
+			bit_clear (up_node_bitmap, i);
+			bit_clear (idle_node_bitmap, i);
+			node_record_table_ptr[i].node_state |= NODE_STATE_NO_RESPOND;
+			kill_running_job_by_node_name (node_record_table_ptr[i].name);
+		}
+	}
+
+	if (agent_args->addr_count == 0) {
+		xfree (agent_args);
+		return;
+	}
+	debug ("Spawning ping agent");
+	if (pthread_attr_init (&attr_agent))
+		fatal ("pthread_attr_init error %m");
+	if (pthread_attr_setdetachstate (&attr_agent, PTHREAD_CREATE_DETACHED))
+		error ("pthread_attr_setdetachstate error %m");
+#ifdef PTHREAD_SCOPE_SYSTEM
+	if (pthread_attr_setscope (&attr_agent, PTHREAD_SCOPE_SYSTEM))
+		error ("pthread_attr_setscope error %m");
+#endif
+	if (pthread_create (&thread_agent, &attr_agent, agent, (void *)agent_args)) {
+		error ("pthread_create error %m");
+		sleep (1); /* sleep and try once more */
+		if (pthread_create (&thread_agent, &attr_agent, agent, (void *)agent_args))
+			fatal ("pthread_create error %m");
+	}
+}
+
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 59dcc929031..a6a40dc95e2 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -369,6 +369,10 @@ extern int job_step_complete (uint32_t job_id, uint32_t job_step_id, uid_t uid);
 /* job_time_limit - enforce job time limits */
 extern void job_time_limit (void);
 
+/* kill_running_job_by_node_name - Given a node name, deallocate that job 
+ *	from the node or kill it */
+extern int kill_running_job_by_node_name (char *node_name);
+
 /* list_append_list - Appends the elements of from list onto the to list */
 extern void list_append_list( List to, List from );
 	
@@ -456,6 +460,9 @@ extern void pack_job (struct job_record *dump_job_ptr, void **buf_ptr, int *buf_
  */
 extern void pack_part (struct part_record *part_record_point, void **buf_ptr, int *buf_len);
 
+/* ping_nodes - check that all nodes and daemons are alive */
+extern void ping_nodes (void);
+
 /*
  * purge_old_job - purge old job records. if memory space is needed. 
  *	the jobs must have completed at least MIN_JOB_AGE minutes ago
-- 
GitLab