diff --git a/NEWS b/NEWS
index 82c9f8ed6579864bdf957a1ae4bbee4b1bc0b5f6..a3f85146e49ee7d9f08c6e56cbcfbdf8bcf27d85 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@ documents those changes that are of interest to users and administrators.
 
 * Changes in Slurm 14.11.9
 ==========================
+ -- Do not count slurmctld threads waiting in a "throttle" lock against the
+    daemon's thread limit as they are not contending for resources.
 
 * Changes in Slurm 14.11.8
 ==========================
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 7ddb2558b6d195e7bc122956b8b4ddf9eea378b0..4dc0a816371a22cae3218b5920d8a6be4212f74f 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -214,7 +214,6 @@ static int _accounting_mark_all_nodes_down(char *reason);
 static void * _assoc_cache_mgr(void *no_data);
 static void _become_slurm_user(void);
 static void _default_sigaction(int sig);
-inline static void _free_server_thread(void);
 static void _init_config(void);
 static void _init_pidfile(void);
 static void _kill_old_slurmctld(void);
@@ -514,9 +513,7 @@ int main(int argc, char *argv[])
 	/*
 	 * create attached thread to process RPCs
 	 */
-	slurm_mutex_lock(&slurmctld_config.thread_count_lock);
-	slurmctld_config.server_thread_count++;
-	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
+	server_thread_incr();
 	slurm_attr_init(&thread_attr);
 	while (pthread_create(&slurmctld_config.thread_id_rpc,
 			      &thread_attr, _slurmctld_rpc_mgr,
@@ -965,7 +962,7 @@ static void *_slurmctld_rpc_mgr(void *no_data)
 		if (select(max_fd+1, &rfds, NULL, NULL, NULL) == -1) {
 			if (errno != EINTR)
 				error("slurm_accept_msg_conn select: %m");
-			_free_server_thread();
+			server_thread_decr();
 			continue;
 		}
 		/* find one to process */
@@ -986,7 +983,7 @@ static void *_slurmctld_rpc_mgr(void *no_data)
 		    SLURM_SOCKET_ERROR) {
 			if (errno != EINTR)
 				error("slurm_accept_msg_conn: %m");
-			_free_server_thread();
+			server_thread_decr();
 			continue;
 		}
 		fd_set_close_on_exec(newsockfd);
@@ -1025,7 +1022,7 @@ static void *_slurmctld_rpc_mgr(void *no_data)
 	for (i=0; i<nports; i++)
 		(void) slurm_shutdown_msg_engine(sockfd[i]);
 	xfree(sockfd);
-	_free_server_thread();
+	server_thread_decr();
 	pthread_exit((void *) 0);
 	return NULL;
 }
@@ -1076,7 +1073,7 @@ static void *_service_connection(void *arg)
 cleanup:
 	slurm_free_msg(msg);
 	xfree(arg);
-	_free_server_thread();
+	server_thread_decr();
 	return return_code;
 }
 
@@ -1123,7 +1120,8 @@ static bool _wait_for_server_thread(void)
 	return rc;
 }
 
-static void _free_server_thread(void)
+/* Decrement slurmctld thread count (as applies to thread limit) */
+extern void server_thread_decr(void)
 {
 	slurm_mutex_lock(&slurmctld_config.thread_count_lock);
 	if (slurmctld_config.server_thread_count > 0)
@@ -1134,7 +1132,15 @@ static void _free_server_thread(void)
 	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
 }
 
-static int _accounting_cluster_ready()
+/* Increment slurmctld thread count (as applies to thread limit) */
+extern void server_thread_incr(void)
+{
+	slurm_mutex_lock(&slurmctld_config.thread_count_lock);
+	slurmctld_config.server_thread_count++;
+	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
+}
+
+static int _accounting_cluster_ready(void)
 {
 	int rc = SLURM_ERROR;
 	time_t event_time = time(NULL);
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 92e3697b11419834564cfed4a131592f75530a91..1a23dd4238a3242f33cf6233ae3e928cce0d2bc6 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -565,7 +565,13 @@ static void _throttle_start(int *active_rpc_cnt)
 			(*active_rpc_cnt)++;
 			break;
 		}
+
+		/* While an RPC is being throttled due to a running RPC of the
+		 * same type, do not count that thread against the daemon's
+		 * thread limit */
+		server_thread_decr();
 		pthread_cond_wait(&throttle_cond, &throttle_mutex);
+		server_thread_incr();
 	}
 	slurm_mutex_unlock(&throttle_mutex);
 	if (LOTS_OF_AGENTS)
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index aefc4bd12df47a57f37908165a4ac201528509e4..e4e691d9b66cfc98f0ce87f6b9115610a0b92e90 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -1866,13 +1866,19 @@ extern void set_cluster_cpus(void);
 /* sends all jobs in eligible state to accounting. Only needed at
  * first registration
  */
-extern int send_jobs_to_accounting();
+extern int send_jobs_to_accounting(void);
 
 /* send all nodes in a down like state to accounting.
  * Only needed at first registration
  */
 extern int send_nodes_to_accounting(time_t event_time);
 
+/* Decrement slurmctld thread count (as applies to thread limit) */
+extern void server_thread_decr(void);
+
+/* Increment slurmctld thread count (as applies to thread limit) */
+extern void server_thread_incr(void);
+
 /* Set a job's alias_list string */
 extern void set_job_alias_list(struct job_record *job_ptr);
 
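
The proc_req.c hunk carries the substance of the change: a thread that blocks in
pthread_cond_wait() inside _throttle_start() now calls server_thread_decr()
before sleeping and server_thread_incr() after waking, so it is not counted
toward the daemon's thread limit while it is only queued behind another RPC of
the same type. Below is a minimal standalone sketch of that pattern; it reuses
the names server_thread_count, thread_count_lock, throttle_mutex and
throttle_cond from the patch, but the simplified throttle_start()/
throttle_fini() harness around them is illustrative only, not Slurm code.

/* Standalone sketch (not Slurm source): remove a waiting thread from the
 * counted server threads for the duration of pthread_cond_wait(). */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t thread_count_lock = PTHREAD_MUTEX_INITIALIZER;
static int server_thread_count = 0;	/* checked against the thread limit */

static pthread_mutex_t throttle_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  throttle_cond  = PTHREAD_COND_INITIALIZER;
static int active_rpc_cnt = 0;		/* one "RPC" of this type at a time */

/* Counterpart of server_thread_decr(): stop counting this thread */
static void server_thread_decr(void)
{
	pthread_mutex_lock(&thread_count_lock);
	if (server_thread_count > 0)
		server_thread_count--;
	pthread_mutex_unlock(&thread_count_lock);
}

/* Counterpart of server_thread_incr(): count this thread again */
static void server_thread_incr(void)
{
	pthread_mutex_lock(&thread_count_lock);
	server_thread_count++;
	pthread_mutex_unlock(&thread_count_lock);
}

/* Simplified _throttle_start(): wait until no other RPC of this type is
 * active; while waiting, give up this thread's slot in the count */
static void throttle_start(void)
{
	pthread_mutex_lock(&throttle_mutex);
	while (active_rpc_cnt != 0) {
		server_thread_decr();
		pthread_cond_wait(&throttle_cond, &throttle_mutex);
		server_thread_incr();
	}
	active_rpc_cnt++;
	pthread_mutex_unlock(&throttle_mutex);
}

/* Simplified _throttle_fini(): release the slot and wake any waiters */
static void throttle_fini(void)
{
	pthread_mutex_lock(&throttle_mutex);
	active_rpc_cnt--;
	pthread_cond_broadcast(&throttle_cond);
	pthread_mutex_unlock(&throttle_mutex);
}

/* Toy "RPC" thread: register, run throttled work, deregister */
static void *rpc_thread(void *arg)
{
	(void) arg;
	server_thread_incr();
	throttle_start();
	pthread_mutex_lock(&thread_count_lock);
	printf("running; counted server threads: %d\n", server_thread_count);
	pthread_mutex_unlock(&thread_count_lock);
	usleep(100000);		/* pretend to do RPC work */
	throttle_fini();
	server_thread_decr();
	return NULL;
}

int main(void)
{
	pthread_t tid[4];
	int i;

	for (i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, rpc_thread, NULL);
	for (i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

With several of these toy threads contending, only the one actually running is
counted; the rest sleep uncounted, which is the behavior the NEWS entry
describes for throttled slurmctld RPC threads.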