From ad9c2413a735a6b15a0133ac5068d546c52de9a1 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Thu, 9 Jul 2015 11:02:24 -0700 Subject: [PATCH] Change slurmctld threads count against limit The slurmctld logic throttles some RPCs so that only one of them can execute at a time in order to reduce contention for the job, partition and node locks (only one of the affected RPCs can execute at any time anyway and this lets other RPC types run). While an RPC is stuck in the throttle function, do not count that thread against the slurmctld thread limit. bug 1794 --- NEWS | 2 ++ src/slurmctld/controller.c | 26 ++++++++++++++++---------- src/slurmctld/proc_req.c | 6 ++++++ src/slurmctld/slurmctld.h | 8 +++++++- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index 82c9f8ed657..a3f85146e49 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,8 @@ documents those changes that are of interest to users and administrators. * Changes in Slurm 14.11.9 ========================== + -- Do not count slurmctld threads waiting in a "throttle" lock against the + daemon's thread limit as they are not contending for resources. 
* Changes in Slurm 14.11.8 ========================== diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 7ddb2558b6d..4dc0a816371 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -214,7 +214,6 @@ static int _accounting_mark_all_nodes_down(char *reason); static void * _assoc_cache_mgr(void *no_data); static void _become_slurm_user(void); static void _default_sigaction(int sig); -inline static void _free_server_thread(void); static void _init_config(void); static void _init_pidfile(void); static void _kill_old_slurmctld(void); @@ -514,9 +513,7 @@ int main(int argc, char *argv[]) /* * create attached thread to process RPCs */ - slurm_mutex_lock(&slurmctld_config.thread_count_lock); - slurmctld_config.server_thread_count++; - slurm_mutex_unlock(&slurmctld_config.thread_count_lock); + server_thread_incr(); slurm_attr_init(&thread_attr); while (pthread_create(&slurmctld_config.thread_id_rpc, &thread_attr, _slurmctld_rpc_mgr, @@ -965,7 +962,7 @@ static void *_slurmctld_rpc_mgr(void *no_data) if (select(max_fd+1, &rfds, NULL, NULL, NULL) == -1) { if (errno != EINTR) error("slurm_accept_msg_conn select: %m"); - _free_server_thread(); + server_thread_decr(); continue; } /* find one to process */ @@ -986,7 +983,7 @@ static void *_slurmctld_rpc_mgr(void *no_data) SLURM_SOCKET_ERROR) { if (errno != EINTR) error("slurm_accept_msg_conn: %m"); - _free_server_thread(); + server_thread_decr(); continue; } fd_set_close_on_exec(newsockfd); @@ -1025,7 +1022,7 @@ static void *_slurmctld_rpc_mgr(void *no_data) for (i=0; i<nports; i++) (void) slurm_shutdown_msg_engine(sockfd[i]); xfree(sockfd); - _free_server_thread(); + server_thread_decr(); pthread_exit((void *) 0); return NULL; } @@ -1076,7 +1073,7 @@ static void *_service_connection(void *arg) cleanup: slurm_free_msg(msg); xfree(arg); - _free_server_thread(); + server_thread_decr(); return return_code; } @@ -1123,7 +1120,8 @@ static bool _wait_for_server_thread(void) return rc; } 
-static void _free_server_thread(void) +/* Decrement slurmctld thread count (as applies to thread limit) */ +extern void server_thread_decr(void) { slurm_mutex_lock(&slurmctld_config.thread_count_lock); if (slurmctld_config.server_thread_count > 0) @@ -1134,7 +1132,15 @@ static void _free_server_thread(void) slurm_mutex_unlock(&slurmctld_config.thread_count_lock); } -static int _accounting_cluster_ready() +/* Increment slurmctld thread count (as applies to thread limit) */ +extern void server_thread_incr(void) +{ + slurm_mutex_lock(&slurmctld_config.thread_count_lock); + slurmctld_config.server_thread_count++; + slurm_mutex_unlock(&slurmctld_config.thread_count_lock); +} + +static int _accounting_cluster_ready(void) { int rc = SLURM_ERROR; time_t event_time = time(NULL); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 92e3697b114..1a23dd4238a 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -565,7 +565,13 @@ static void _throttle_start(int *active_rpc_cnt) (*active_rpc_cnt)++; break; } + + /* While an RPC is being throttled due to a running RPC of the + * same type, do not count that thread against the daemon's + * thread limit */ + server_thread_decr(); pthread_cond_wait(&throttle_cond, &throttle_mutex); + server_thread_incr(); } slurm_mutex_unlock(&throttle_mutex); if (LOTS_OF_AGENTS) diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index aefc4bd12df..e4e691d9b66 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1866,13 +1866,19 @@ extern void set_cluster_cpus(void); /* sends all jobs in eligible state to accounting. Only needed at * first registration */ -extern int send_jobs_to_accounting(); +extern int send_jobs_to_accounting(void); /* send all nodes in a down like state to accounting. 
Only needed at * first registration */ extern int send_nodes_to_accounting(time_t event_time); +/* Decrement slurmctld thread count (as applies to thread limit) */ +extern void server_thread_decr(void); + +/* Increment slurmctld thread count (as applies to thread limit) */ +extern void server_thread_incr(void); + /* Set a job's alias_list string */ extern void set_job_alias_list(struct job_record *job_ptr); -- GitLab