From 6f1cb823c1a1103bf6cbc52adf8a3d051f774f0b Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 23 Sep 2002 21:35:54 +0000
Subject: [PATCH] Added support for ping RPC. slurmctld now will ping slurmd
 periodically and flag non-responsive nodes as such.

---
 src/slurmctld/agent.c      |  7 ++--
 src/slurmctld/controller.c | 50 ++++++++++++++++++++++---
 src/slurmctld/node_mgr.c   | 77 ++++++++++++++++++++++++++++++++++++++
 src/slurmctld/slurmctld.h  |  7 ++++
 4 files changed, 133 insertions(+), 8 deletions(-)
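
Editor's note (not part of the commit): the background loop in slurmctld now
calls ping_nodes() whenever heartbeat_interval seconds have passed since the
last ping pass. ping_nodes() batches every non-DOWN node whose last_response
is older than that interval into an agent_arg_t with msg_type REQUEST_PING and
hands it to a detached agent thread; nodes silent longer than slurmd_timeout
are flagged NODE_STATE_NO_RESPOND and their jobs are killed via
kill_running_job_by_node_name(). node_did_resp()/node_not_resp() later clear
or set that flag and the up/idle bitmaps. The sketch below illustrates only
the timing rule; the struct, the node list, and the 60/300 second thresholds
are invented stand-ins, not slurmctld code or real configuration values:

    /* Illustrative sketch only: stand-in types and assumed config values. */
    #include <stdio.h>
    #include <time.h>

    struct fake_node {                  /* stands in for struct node_record */
            const char *name;
            time_t      last_response;  /* when the node last answered */
    };

    int main (void)
    {
            const double heartbeat_interval = 60;   /* assumed value */
            const double slurmd_timeout     = 300;  /* assumed value */
            time_t now = time (NULL);
            struct fake_node nodes[] = {
                    { "lx01", now - 10  },  /* fresh: skipped */
                    { "lx02", now - 90  },  /* stale: pinged */
                    { "lx03", now - 400 },  /* very stale: flagged */
            };
            int i;

            for (i = 0; i < 3; i++) {
                    double age = difftime (now, nodes[i].last_response);
                    if (age < heartbeat_interval)
                            continue;   /* responded recently, no ping */
                    printf ("ping %s\n", nodes[i].name);
                    if (age >= slurmd_timeout)  /* same test ping_nodes uses */
                            printf ("%s not responding\n", nodes[i].name);
            }
            return 0;
    }

In the patch itself the per-node decision above happens while building the
REQUEST_PING request, and the actual RPCs are sent from the agent thread so
the background loop does not wait on the network.
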
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 1076e657204..73da1829d47 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -131,7 +131,8 @@ agent (void *args)
                 fatal ("agent passed NULL address list");
         if (agent_arg_ptr->node_names == NULL)
                 fatal ("agent passed NULL node name list");
-        if (agent_arg_ptr->msg_type != REQUEST_REVOKE_JOB_CREDENTIAL)
+        if ((agent_arg_ptr->msg_type != REQUEST_REVOKE_JOB_CREDENTIAL) &&
+            (agent_arg_ptr->msg_type != REQUEST_PING))
                 fatal ("agent passed invalid message type %d", agent_arg_ptr->msg_type);

         /* initialize the data structures */
@@ -252,8 +253,8 @@ wdog (void *args)
         agent_info_t *agent_ptr = (agent_info_t *) args;
         thd_t *thread_ptr = agent_ptr->thread_struct;
 #if AGENT_IS_THREAD
-        /* Locks: Write node */
-        slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
+        /* Locks: Write job and write node */
+        slurmctld_lock_t node_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 #else
         int done_cnt;
         char *slurm_names;
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 92204b369cb..075970c1fb3 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -98,6 +98,7 @@ inline static void slurm_rpc_job_step_create( slurm_msg_t* msg ) ;
 inline static void slurm_rpc_job_step_get_info ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_job_will_run ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_node_registration ( slurm_msg_t * msg ) ;
+inline static void slurm_rpc_ping ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_shutdown_controller ( slurm_msg_t * msg );
 inline static void slurm_rpc_shutdown_controller_immediate ( slurm_msg_t * msg );
@@ -378,15 +379,19 @@ slurmctld_background ( void * no_data )
 {
         static time_t last_sched_time;
         static time_t last_checkpoint_time;
+        static time_t last_ping_time;
         static time_t last_timelimit_time;
         time_t now;
         /* Locks: Write job, write node, read partition */
         slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
+        /* Locks: Write job, write node */
+        slurmctld_lock_t node_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
         /* Locks: Write partition */
         slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK };

         /* Let the dust settle before doing work */
-        last_sched_time = last_checkpoint_time = last_timelimit_time = time (NULL);
+        last_sched_time = last_checkpoint_time = last_timelimit_time =
+                last_ping_time = time (NULL);
         (void) pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
         (void) pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
         debug3 ("slurmctld_background pid = %u", getpid ());
@@ -396,7 +401,7 @@ slurmctld_background ( void * no_data )

                 now = time (NULL);

-                if ((now - last_timelimit_time) > PERIODIC_TIMEOUT) {
+                if (difftime (now, last_timelimit_time) > PERIODIC_TIMEOUT) {
                         last_timelimit_time = now;
                         debug ("Performing job time limit check");
                         lock_slurmctld (job_write_lock);
@@ -404,14 +409,22 @@ slurmctld_background ( void * no_data )
                         unlock_slurmctld (job_write_lock);
                 }

-                if ((now - last_timelimit_time) > PERIODIC_GROUP_CHECK) {
+                if (difftime (now, last_ping_time) >= slurmctld_conf.heartbeat_interval) {
+                        last_ping_time = now;
+                        debug ("Performing node ping");
+                        lock_slurmctld (node_write_lock);
+                        ping_nodes ();
+                        unlock_slurmctld (node_write_lock);
+                }
+
+                if (difftime (now, last_timelimit_time) >= PERIODIC_GROUP_CHECK) {
                         last_timelimit_time = now;
                         lock_slurmctld (part_write_lock);
                         load_part_uid_allow_list ( 0 );
                         unlock_slurmctld (part_write_lock);
                 }

-                if ((now - last_sched_time) > PERIODIC_SCHEDULE) {
+                if (difftime (now, last_sched_time) >= PERIODIC_SCHEDULE) {
                         last_sched_time = now;
                         debug ("Performing purge of old job records");
                         lock_slurmctld (job_write_lock);
@@ -421,7 +434,8 @@ slurmctld_background ( void * no_data )
                         last_checkpoint_time = 0;       /* force state save */
                 }

-                if (shutdown_time || (now - last_checkpoint_time) > PERIODIC_CHECKPOINT) {
+                if (shutdown_time ||
+                    (difftime (now, last_checkpoint_time) >= PERIODIC_CHECKPOINT) ) {
                         if (shutdown_time) {
                                 /* wait for any RPC's to complete */
                                 if (server_thread_count)
@@ -578,6 +592,9 @@ slurmctld_req ( slurm_msg_t * msg )
                 case REQUEST_SHUTDOWN_IMMEDIATE:
                         slurm_rpc_shutdown_controller_immediate ( msg ) ;
                         break;
+                case REQUEST_PING:
+                        slurm_rpc_ping ( msg ) ;
+                        break;
                 case REQUEST_UPDATE_JOB:
                         slurm_rpc_update_job ( msg ) ;
                         slurm_free_job_desc_msg ( msg -> data ) ;
@@ -1376,6 +1393,29 @@ void slurm_rpc_job_will_run ( slurm_msg_t * msg )
         }
 }

+/* slurm_rpc_ping - process ping RPC */
+void
+slurm_rpc_ping ( slurm_msg_t * msg )
+{
+        /* init */
+        int error_code = SLURM_SUCCESS;
+#ifdef HAVE_AUTHD
+        uid_t uid = 0;
+#endif
+
+#ifdef HAVE_AUTHD
+        uid = slurm_auth_uid (msg->cred);
+        if ( (uid != 0) && (uid != getuid ()) ) {
+                error ("Security violation, PING RPC from uid %u", (unsigned int) uid);
+                error_code = ESLURM_USER_ID_MISSING;
+        }
+#endif
+
+        /* return result */
+        slurm_send_rc_msg ( msg , error_code );
+}
+
+
 /* slurm_rpc_reconfigure_controller - process RPC to re-initialize slurmctld from configuration file */
 void
 slurm_rpc_reconfigure_controller ( slurm_msg_t * msg )
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index a354ac3fc4a..582e8314ec0 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -35,6 +35,7 @@
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
+#include <time.h>
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -42,6 +43,7 @@
 #include <src/common/hostlist.h>
 #include <src/common/xstring.h>

+#include <src/slurmctld/agent.h>
 #include <src/slurmctld/locks.h>
 #include <src/slurmctld/slurmctld.h>
@@ -1089,6 +1091,13 @@ node_did_resp (char *name)
         i = node_ptr - node_record_table_ptr;
         last_node_update = time (NULL);
         node_record_table_ptr[i].last_response = time (NULL);
+        node_ptr->node_state &= (uint16_t) (~NODE_STATE_NO_RESPOND);
+        if (node_ptr->node_state == NODE_STATE_UNKNOWN)
+                node_ptr->node_state = NODE_STATE_IDLE;
+        if (node_ptr->node_state == NODE_STATE_IDLE)
+                bit_set (idle_node_bitmap, (node_ptr - node_record_table_ptr));
+        if (node_ptr->node_state != NODE_STATE_DOWN)
+                bit_set (up_node_bitmap, (node_ptr - node_record_table_ptr));
         return;
 }

@@ -1106,11 +1115,79 @@ node_not_resp (char *name)
         }

         i = node_ptr - node_record_table_ptr;
+        if (node_record_table_ptr[i].node_state & NODE_STATE_NO_RESPOND)
+                return; /* Already known to be not responding */
+
         last_node_update = time (NULL);
         error ("Node %s not responding", name);
responding", name); bit_clear (up_node_bitmap, i); bit_clear (idle_node_bitmap, i); node_record_table_ptr[i].node_state |= NODE_STATE_NO_RESPOND; + kill_running_job_by_node_name (node_record_table_ptr[i].name); return; } +/* ping_nodes - check that all nodes and daemons are alive */ +void +ping_nodes (void) +{ + int i, age; + int buf_rec_size = 0; + time_t now; + agent_arg_t *agent_args; + pthread_attr_t attr_agent; + pthread_t thread_agent; + + agent_args = xmalloc (sizeof (agent_arg_t)); + agent_args->msg_type = REQUEST_PING; + now = time(NULL); + for (i = 0; i < node_record_count; i++) { + if (node_record_table_ptr[i].node_state == NODE_STATE_DOWN) + continue; + + age = difftime (now, node_record_table_ptr[i].last_response); + if (age < slurmctld_conf.heartbeat_interval) + continue; + + debug3 ("ping %s now", node_record_table_ptr[i].name); + if ((agent_args->addr_count+1) > buf_rec_size) { + buf_rec_size += 32; + xrealloc ((agent_args->slurm_addr), (sizeof (struct sockaddr_in) * buf_rec_size)); + xrealloc ((agent_args->node_names), (MAX_NAME_LEN * buf_rec_size)); + } + agent_args->slurm_addr[agent_args->addr_count] = node_record_table_ptr[i].slurm_addr; + strncpy (&agent_args->node_names[MAX_NAME_LEN*agent_args->addr_count], + node_record_table_ptr[i].name, MAX_NAME_LEN); + agent_args->addr_count++; + + if (age >= slurmctld_conf.slurmd_timeout) { + error ("node %s not responding", node_record_table_ptr[i].name); + last_node_update = time (NULL); + bit_clear (up_node_bitmap, i); + bit_clear (idle_node_bitmap, i); + node_record_table_ptr[i].node_state |= NODE_STATE_NO_RESPOND; + kill_running_job_by_node_name (node_record_table_ptr[i].name); + } + } + + if (agent_args->addr_count == 0) { + xfree (agent_args); + return; + } + debug ("Spawning ping agent"); + if (pthread_attr_init (&attr_agent)) + fatal ("pthread_attr_init error %m"); + if (pthread_attr_setdetachstate (&attr_agent, PTHREAD_CREATE_DETACHED)) + error ("pthread_attr_setdetachstate error %m"); +#ifdef PTHREAD_SCOPE_SYSTEM + if (pthread_attr_setscope (&attr_agent, PTHREAD_SCOPE_SYSTEM)) + error ("pthread_attr_setscope error %m"); +#endif + if (pthread_create (&thread_agent, &attr_agent, agent, (void *)agent_args)) { + error ("pthread_create error %m"); + sleep (1); /* sleep and try once more */ + if (pthread_create (&thread_agent, &attr_agent, agent, (void *)agent_args)) + fatal ("pthread_create error %m"); + } +} + diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 59dcc929031..a6a40dc95e2 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -369,6 +369,10 @@ extern int job_step_complete (uint32_t job_id, uint32_t job_step_id, uid_t uid); /* job_time_limit - enforce job time limits */ extern void job_time_limit (void); +/* kill_running_job_by_node_name - Given a node name, deallocate that job + * from the node or kill it */ +extern int kill_running_job_by_node_name (char *node_name); + /* list_append_list - Appends the elements of from list onto the to list */ extern void list_append_list( List to, List from ); @@ -456,6 +460,9 @@ extern void pack_job (struct job_record *dump_job_ptr, void **buf_ptr, int *buf_ */ extern void pack_part (struct part_record *part_record_point, void **buf_ptr, int *buf_len); +/* ping_nodes - check that all nodes and daemons are alive */ +extern void ping_nodes (void); + /* * purge_old_job - purge old job records. if memory space is needed. * the jobs must have completed at least MIN_JOB_AGE minutes ago -- GitLab