diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c index 528b4364794fe4e81b156480f4a6d019fa8b417b..047fc01f53d33f5826c65ec52c540a639ed4f5b9 100644 --- a/src/slurmctld/backup.c +++ b/src/slurmctld/backup.c @@ -78,9 +78,8 @@ void run_backup(void) { time_t last_controller_response = time(NULL), last_ping = 0; pthread_attr_t thread_attr_sig, thread_attr_rpc; - slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, - NO_LOCK, NO_LOCK - }; + slurmctld_lock_t config_read_lock = { + READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; info("slurmctld running in background mode"); /* default: don't resume if shutdown */ @@ -116,7 +115,7 @@ void run_backup(void) &thread_attr_sig, _background_signal_hand, NULL)) fatal("pthread_create %m"); - sleep(5); /* Give the primary slurmctld set-up time */ + sleep(5); /* Give the primary slurmctld set-up time */ /* repeatedly ping ControlMachine */ while (slurmctld_config.shutdown_time == 0) { sleep(1); @@ -186,9 +185,8 @@ static void *_background_signal_hand(void *no_data) int sig, error_code; sigset_t set; /* Locks: Write configuration, job, node, and partition */ - slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK, - WRITE_LOCK, WRITE_LOCK - }; + slurmctld_lock_t config_write_lock = { + WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK }; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); @@ -200,7 +198,7 @@ static void *_background_signal_hand(void *no_data) sleep(1); } - while (1) { + while (slurmctld_config.shutdown_time == 0) { xsignal_sigset_create(backup_sigarray, &set); sigwait(&set, &sig); switch (sig) { @@ -208,7 +206,6 @@ static void *_background_signal_hand(void *no_data) case SIGTERM: /* kill -15 */ info("Terminate signal (SIGINT or SIGTERM) received"); slurmctld_config.shutdown_time = time(NULL); - /* send REQUEST_SHUTDOWN_IMMEDIATE RPC */ slurmctld_shutdown(); return NULL; /* Normal termination */ break; @@ -233,7 +230,6 @@ static void *_background_signal_hand(void *no_data) case SIGABRT: /* abort */ info("SIGABRT received"); slurmctld_config.shutdown_time = time(NULL); - /* send REQUEST_SHUTDOWN_IMMEDIATE RPC */ slurmctld_shutdown(); dump_core = true; return NULL; /* Normal termination */ @@ -242,6 +238,7 @@ static void *_background_signal_hand(void *no_data) error("Invalid signal (%d) received", sig); } } + return NULL; } /* Reset the job credential key based upon configuration parameters. @@ -252,6 +249,10 @@ static void _update_cred_key(void) slurmctld_conf.job_credential_private_key); } +static void _sig_handler(int signal) +{ +} + /* _background_rpc_mgr - Read and process incoming RPCs to the background * controller (that's us) */ static void *_background_rpc_mgr(void *no_data) @@ -260,11 +261,11 @@ static void *_background_rpc_mgr(void *no_data) slurm_fd sockfd; slurm_addr cli_addr; slurm_msg_t *msg = NULL; - bool done_flag = false; int error_code; - slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, - NO_LOCK, NO_LOCK - }; + /* Read configuration only */ + slurmctld_lock_t config_read_lock = { + READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; + int sigarray[] = {SIGUSR1, 0}; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); @@ -278,28 +279,38 @@ static void *_background_rpc_mgr(void *no_data) fatal("slurm_init_msg_engine_port error %m"); unlock_slurmctld(config_read_lock); + /* Prepare to catch SIGUSR1 to interrupt accept(). + * This signal is generated by the slurmctld signal + * handler thread upon receipt of SIGABRT, SIGINT, + * or SIGTERM. That thread does all processing of + * all signals. */ + xsignal(SIGUSR1, _sig_handler); + xsignal_unblock(sigarray); + /* * Process incoming RPCs indefinitely */ - while (done_flag == false) { + while (slurmctld_config.shutdown_time == 0) { /* accept needed for stream implementation * is a no-op in message implementation that just passes * sockfd to newsockfd */ if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr)) == SLURM_SOCKET_ERROR) { - error("slurm_accept_msg_conn error %m"); + if (errno != EINTR) + error("slurm_accept_msg_conn: %m"); continue; } msg = xmalloc(sizeof(slurm_msg_t)); msg->conn_fd = newsockfd; if (slurm_receive_msg(newsockfd, msg, 0) < 0) - error("slurm_receive_msg error %m"); + error("slurm_receive_msg: %m"); else { error_code = _background_process_msg(msg); if ((error_code == SLURM_SUCCESS) && - (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE)) - done_flag = true; + (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) && + (slurmctld_config.shutdown_time == 0)) + slurmctld_config.shutdown_time = time(NULL); } slurm_free_msg(msg); @@ -354,9 +365,8 @@ static int _ping_controller(void) int rc; slurm_msg_t req; /* Locks: Read configuration */ - slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, - NO_LOCK, NO_LOCK - }; + slurmctld_lock_t config_read_lock = { + READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; /* * Set address of controller to ping diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index fce1f6732067eed4eb32fe2602d2093d0d1ae35c..2a4085d935d6984a605ab25bcf0df08723dad3f2 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -125,7 +125,7 @@ static void * _slurmctld_rpc_mgr(void *no_data); static void * _slurmctld_signal_hand(void *no_data); inline static void _update_cred_key(void); inline static void _usage(char *prog_name); -static void _wait_for_server_thread(void); +static bool _wait_for_server_thread(void); typedef struct connection_arg { int newsockfd; @@ -419,7 +419,6 @@ static void *_slurmctld_signal_hand(void *no_data) case SIGTERM: /* kill -15 */ info("Terminate signal (SIGINT or SIGTERM) received"); slurmctld_config.shutdown_time = time(NULL); - /* send REQUEST_SHUTDOWN_IMMEDIATE RPC */ slurmctld_shutdown(); return NULL; /* Normal termination */ break; @@ -444,7 +443,6 @@ static void *_slurmctld_signal_hand(void *no_data) case SIGABRT: /* abort */ info("SIGABRT received"); slurmctld_config.shutdown_time = time(NULL); - /* send REQUEST_SHUTDOWN_IMMEDIATE RPC */ slurmctld_shutdown(); dump_core = true; return NULL; @@ -470,6 +468,10 @@ static void _default_sigaction(int sig) error("sigaction(%d): %m", sig); } +static void _sig_handler(int signal) +{ +} + /* _slurmctld_rpc_mgr - Read incoming RPCs and create pthread for each */ static void *_slurmctld_rpc_mgr(void *no_data) { @@ -483,6 +485,7 @@ static void *_slurmctld_rpc_mgr(void *no_data) /* Locks: Read config */ slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; + int sigarray[] = {SIGUSR1, 0}; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); @@ -509,20 +512,28 @@ static void *_slurmctld_rpc_mgr(void *no_data) fatal("slurm_init_msg_engine_port error %m"); unlock_slurmctld(config_read_lock); + /* Prepare to catch SIGUSR1 to interrupt accept(). + * This signal is generated by the slurmctld signal + * handler thread upon receipt of SIGABRT, SIGINT, + * or SIGTERM. That thread does all processing of + * all signals. */ + xsignal(SIGUSR1, _sig_handler); + xsignal_unblock(sigarray); + /* - * Process incoming RPCs indefinitely + * Process incoming RPCs until told to shutdown */ - while (1) { + while (_wait_for_server_thread()) { /* * accept needed for stream implementation is a no-op in * message implementation that just passes sockfd to newsockfd */ - _wait_for_server_thread(); if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr)) == SLURM_SOCKET_ERROR) { _free_server_thread(); - error("slurm_accept_msg_conn error %m"); + if (errno != EINTR) + error("slurm_accept_msg_conn: %m"); continue; } conn_arg = xmalloc(sizeof(connection_arg_t)); @@ -533,14 +544,13 @@ static void *_slurmctld_rpc_mgr(void *no_data) &thread_attr_rpc_req, _service_connection, (void *) conn_arg)) { - error("pthread_create error %m"); + error("pthread_create: %m"); no_thread = 1; } else no_thread = 0; if (no_thread) { - if (_service_connection((void *) conn_arg)) - break; + _service_connection((void *) conn_arg); } } @@ -552,10 +562,10 @@ static void *_slurmctld_rpc_mgr(void *no_data) } /* - * _service_connection - service the RPC, return NULL except in case - * of REQUEST_SHUTDOWN_IMMEDIATE + * _service_connection - service the RPC * IN/OUT arg - really just the connection's file descriptor, freed * upon completion + * RET - NULL */ static void *_service_connection(void *arg) { @@ -572,8 +582,6 @@ static void *_service_connection(void *arg) } else info("_service_connection/slurm_receive_msg %m"); } else { - if (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) - return_code = (void *) "fini"; msg->conn_fd = newsockfd; slurmctld_req (msg); /* process the request */ } @@ -590,13 +598,19 @@ static void *_service_connection(void *arg) } /* Increment slurmctld_config.server_thread_count and don't return - * until its value is no larger than MAX_SERVER_THREADS */ -static void _wait_for_server_thread(void) + * until its value is no larger than MAX_SERVER_THREADS, + * RET true unless shutdown in progress */ +static bool _wait_for_server_thread(void) { bool print_it = true; + bool rc = true; slurm_mutex_lock(&slurmctld_config.thread_count_lock); while (1) { + if (slurmctld_config.shutdown_time) { + rc = false; + break; + } if (slurmctld_config.server_thread_count < MAX_SERVER_THREADS) { slurmctld_config.server_thread_count++; @@ -612,6 +626,7 @@ static void _wait_for_server_thread(void) } } slurm_mutex_unlock(&slurmctld_config.thread_count_lock); + return rc; } static void _free_server_thread(void) @@ -830,43 +845,18 @@ static int _report_locks_set(void) } /* - * slurmctld_shutdown - issue RPC to have slurmctld shutdown, knocks - * loose an slurm_accept_msg_conn() if we have a thread hung there + * slurmctld_shutdown - wake up slurm_rpc_mgr thread via signal * RET 0 or error code */ int slurmctld_shutdown(void) { - int rc; - slurm_fd sockfd; - slurm_msg_t request_msg; - slurm_addr self; - - /* init message connection for message communication - * with self/controller */ - slurm_set_addr(&self, slurmctld_conf.slurmctld_port, "localhost"); - if ((sockfd = slurm_open_msg_conn(&self)) == SLURM_SOCKET_ERROR) { - error("slurmctld_shutdown/slurm_open_msg_conn: %m"); - return SLURM_SOCKET_ERROR; - } - - /* send request message */ - request_msg.msg_type = REQUEST_SHUTDOWN_IMMEDIATE; - - if ((rc = slurm_send_node_msg(sockfd, &request_msg)) - == SLURM_SOCKET_ERROR) { - error("slurmctld_shutdown/slurm_send_node_msg error: %m"); - return SLURM_SOCKET_ERROR; - } - - /* no response */ - - /* shutdown message connection */ - if ((rc = slurm_shutdown_msg_conn(sockfd)) == SLURM_SOCKET_ERROR) { - error("slurm_shutdown_msg_conn error"); - return SLURM_SOCKET_ERROR; + if (slurmctld_config.thread_id_rpc) { + pthread_kill(slurmctld_config.thread_id_rpc, SIGUSR1); + return SLURM_SUCCESS; + } else { + error("thread_id_rpc not set"); + return SLURM_ERROR; } - - return SLURM_PROTOCOL_SUCCESS; } /* Variables for commandline passing using getopt */ diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index d5f2a9e4252271d948a84fca7371e093937be0b4..d741510d32f58db179122d0b0bc8ec90e43a3ebd 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1021,8 +1021,7 @@ extern void set_slurmd_addr (void); extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal); /* - * slurmctld_shutdown - issue RPC to have slurmctld shutdown, knocks - * loose an slurm_accept_msg_conn() if we have a thread hung there + * slurmctld_shutdown - wake up slurm_rpc_mgr thread via signal * RET 0 or error code */ extern int slurmctld_shutdown(void);