From ac94747a847964c804847f0ee40e578d337424a4 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 24 Aug 2010 18:47:39 +0000 Subject: [PATCH] Add "try_lock" logic to slurmctld for background functions to be deferred if required locks are not immediately available. This will improve performance when heavy-weight functions such as backfill scheduler are running. --- NEWS | 4 ++ src/slurmctld/controller.c | 80 ++++++++++++------------ src/slurmctld/locks.c | 123 +++++++++++++++++++++++++++++-------- src/slurmctld/locks.h | 7 ++- 4 files changed, 147 insertions(+), 67 deletions(-) diff --git a/NEWS b/NEWS index 5e8c8e5dbac..e2fe9019ae4 100644 --- a/NEWS +++ b/NEWS @@ -31,6 +31,10 @@ documents those changes that are of interest to users and admins. select/cray plugin cons_res is the default. To use linear add 'Linear' to SelectTypeParameters. -- Fixed bug where resizing jobs didn't correctly set used limits correctly. + -- Add "try_lock" logic to slurmctld for background functions to be deferred + if required locks are not immediately available. This will improve + performance when heavy-weight functions such as backfill scheduler are + running. * Changes in SLURM 2.2.0.pre9 ============================= diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index f83bb232033..69c1f62e484 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1217,6 +1217,7 @@ static void *_slurmctld_background(void *no_data) static time_t last_assert_primary_time; static time_t last_trigger; static time_t last_node_acct; + static bool ping_msg_sent = false; time_t now; int no_resp_msg_interval, ping_interval, purge_job_interval; int group_time, group_force; @@ -1306,25 +1307,25 @@ static void *_slurmctld_background(void *no_data) break; } - if (difftime(now, last_resv_time) >= 2) { + if ((difftime(now, last_resv_time) >= 2) && + (try_lock_slurmctld(node_write_lock) == 0)) { last_resv_time = now; - lock_slurmctld(node_write_lock); set_node_maint_mode(); unlock_slurmctld(node_write_lock); } - if (difftime(now, last_no_resp_msg_time) >= - no_resp_msg_interval) { + if ((difftime(now, last_no_resp_msg_time) >= + no_resp_msg_interval) && + (try_lock_slurmctld(node_write_lock2) == 0)) { last_no_resp_msg_time = now; - lock_slurmctld(node_write_lock2); node_no_resp_msg(); unlock_slurmctld(node_write_lock2); } - if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) { + if ((difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) && + (try_lock_slurmctld(job_write_lock) == 0)) { last_timelimit_time = now; debug2("Testing job time limits and checkpoints"); - lock_slurmctld(job_write_lock); job_time_limit(); step_checkpoint(); unlock_slurmctld(job_write_lock); @@ -1332,44 +1333,43 @@ static void *_slurmctld_background(void *no_data) if (slurmctld_conf.health_check_interval && (difftime(now, last_health_check_time) >= - slurmctld_conf.health_check_interval)) { - if (is_ping_done()) { - last_health_check_time = now; - lock_slurmctld(node_write_lock); - run_health_check(); + slurmctld_conf.health_check_interval) && + is_ping_done() && + (try_lock_slurmctld(node_write_lock) == 0)) { + last_health_check_time = now; + run_health_check(); #ifdef HAVE_CRAY_XT - basil_query(); + basil_query(); #endif - unlock_slurmctld(node_write_lock); - } + unlock_slurmctld(node_write_lock); } - if ((difftime(now, last_ping_node_time) >= ping_interval) || - ping_nodes_now) { - static bool msg_sent = false; - if (is_ping_done()) { - msg_sent = false; - last_ping_node_time = now; - ping_nodes_now = false; - lock_slurmctld(node_write_lock); - ping_nodes(); - unlock_slurmctld(node_write_lock); - } else if ((!msg_sent) && (!ping_nodes_now)) { - /* log failure once per ping_nodes() call, - * no error if node state update request - * processed while the ping is in progress */ - error("Node ping apparently hung, " - "many nodes may be DOWN or configured " - "SlurmdTimeout should be increased"); - msg_sent = true; - } + if (((difftime(now, last_ping_node_time) >= ping_interval) || + ping_nodes_now) && + is_ping_done() && + (try_lock_slurmctld(node_write_lock) == 0)) { + ping_msg_sent = false; + last_ping_node_time = now; + ping_nodes_now = false; + ping_nodes(); + unlock_slurmctld(node_write_lock); + } else if ((difftime(now, last_ping_node_time) >= + ping_interval) && !is_ping_done() && + !ping_msg_sent) { + /* log failure once per ping_nodes() call, + * no error if node state update request + * processed while the ping is in progress */ + error("Node ping apparently hung, " + "many nodes may be DOWN or configured " + "SlurmdTimeout should be increased"); + ping_msg_sent = true; } if (slurmctld_conf.inactive_limit && (difftime(now, last_ping_srun_time) >= - (slurmctld_conf.inactive_limit / 3))) { + (slurmctld_conf.inactive_limit / 3)) && + (try_lock_slurmctld(job_read_lock) == 0)) { last_ping_srun_time = now; debug2("Performing srun ping"); - lock_slurmctld(job_read_lock); srun_ping(); unlock_slurmctld(job_read_lock); } @@ -1379,21 +1379,21 @@ static void *_slurmctld_background(void *no_data) group_time = slurmctld_conf.group_info & GROUP_TIME_MASK; if (group_time && - (difftime(now, last_group_time) >= group_time)) { + (difftime(now, last_group_time) >= group_time) && + (try_lock_slurmctld(part_write_lock) == 0)) { if (slurmctld_conf.group_info & GROUP_FORCE) group_force = 1; else group_force = 0; last_group_time = now; - lock_slurmctld(part_write_lock); load_part_uid_allow_list(group_force); unlock_slurmctld(part_write_lock); } - if (difftime(now, last_purge_job_time) >= purge_job_interval) { + if ((difftime(now, last_purge_job_time) >=purge_job_interval)&& + (try_lock_slurmctld(job_write_lock) == 0)) { last_purge_job_time = now; debug2("Performing purge of old job records"); - lock_slurmctld(job_write_lock); purge_old_job(); unlock_slurmctld(job_write_lock); } diff --git a/src/slurmctld/locks.c b/src/slurmctld/locks.c index c557f331b63..34ed56812f2 100644 --- a/src/slurmctld/locks.c +++ b/src/slurmctld/locks.c @@ -59,9 +59,9 @@ static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER; static slurmctld_lock_flags_t slurmctld_locks; static int kill_thread = 0; -static void _wr_rdlock(lock_datatype_t datatype); +static bool _wr_rdlock(lock_datatype_t datatype, bool wait_lock); static void _wr_rdunlock(lock_datatype_t datatype); -static void _wr_wrlock(lock_datatype_t datatype); +static bool _wr_wrlock(lock_datatype_t datatype, bool wait_lock); static void _wr_wrunlock(lock_datatype_t datatype); /* init_locks - create locks used for slurmctld data structure access @@ -72,34 +72,97 @@ void init_locks(void) memset((void *) &slurmctld_locks, 0, sizeof(slurmctld_locks)); } -/* lock_slurmctld - Issue the required lock requests in a well defined order - * RET 0 on success, -1 on failure */ -void lock_slurmctld(slurmctld_lock_t lock_levels) +/* lock_slurmctld - Issue the required lock requests in a well defined order */ +extern void lock_slurmctld(slurmctld_lock_t lock_levels) { if (lock_levels.config == READ_LOCK) - _wr_rdlock(CONFIG_LOCK); + (void) _wr_rdlock(CONFIG_LOCK, true); else if (lock_levels.config == WRITE_LOCK) - _wr_wrlock(CONFIG_LOCK); + (void) _wr_wrlock(CONFIG_LOCK, true); if (lock_levels.job == READ_LOCK) - _wr_rdlock(JOB_LOCK); + (void) _wr_rdlock(JOB_LOCK, true); else if (lock_levels.job == WRITE_LOCK) - _wr_wrlock(JOB_LOCK); + (void) _wr_wrlock(JOB_LOCK, true); if (lock_levels.node == READ_LOCK) - _wr_rdlock(NODE_LOCK); + (void) _wr_rdlock(NODE_LOCK, true); else if (lock_levels.node == WRITE_LOCK) - _wr_wrlock(NODE_LOCK); + (void) _wr_wrlock(NODE_LOCK, true); if (lock_levels.partition == READ_LOCK) - _wr_rdlock(PART_LOCK); + (void) _wr_rdlock(PART_LOCK, true); else if (lock_levels.partition == WRITE_LOCK) - _wr_wrlock(PART_LOCK); + (void) _wr_wrlock(PART_LOCK, true); +} + +/* try_lock_slurmctld - equivalent to lock_slurmctld() except + * RET 0 on success or -1 if the locks are currently not available */ +extern int try_lock_slurmctld (slurmctld_lock_t lock_levels) +{ + bool success = true; + + if (lock_levels.config == READ_LOCK) + success = _wr_rdlock(CONFIG_LOCK, false); + else if (lock_levels.config == WRITE_LOCK) + success = _wr_wrlock(CONFIG_LOCK, false); + if (!success) + return -1; + + if (lock_levels.job == READ_LOCK) + success = _wr_rdlock(JOB_LOCK, false); + else if (lock_levels.job == WRITE_LOCK) + success = _wr_wrlock(JOB_LOCK, false); + if (!success) { + if (lock_levels.config == READ_LOCK) + _wr_rdunlock(CONFIG_LOCK); + else if (lock_levels.config == WRITE_LOCK) + _wr_wrunlock(CONFIG_LOCK); + return -1; + } + + if (lock_levels.node == READ_LOCK) + success = _wr_rdlock(NODE_LOCK, false); + else if (lock_levels.node == WRITE_LOCK) + success = _wr_wrlock(NODE_LOCK, false); + if (!success) { + if (lock_levels.job == READ_LOCK) + _wr_rdunlock(JOB_LOCK); + else if (lock_levels.job == WRITE_LOCK) + _wr_wrunlock(JOB_LOCK); + if (lock_levels.config == READ_LOCK) + _wr_rdunlock(CONFIG_LOCK); + else if (lock_levels.config == WRITE_LOCK) + _wr_wrunlock(CONFIG_LOCK); + return -1; + } + + if (lock_levels.partition == READ_LOCK) + success = _wr_rdlock(PART_LOCK, false); + else if (lock_levels.partition == WRITE_LOCK) + success = _wr_wrlock(PART_LOCK, false); + if (!success) { + if (lock_levels.node == READ_LOCK) + _wr_rdunlock(NODE_LOCK); + else if (lock_levels.node == WRITE_LOCK) + _wr_wrunlock(NODE_LOCK); + if (lock_levels.job == READ_LOCK) + _wr_rdunlock(JOB_LOCK); + else if (lock_levels.job == WRITE_LOCK) + _wr_wrunlock(JOB_LOCK); + if (lock_levels.config == READ_LOCK) + _wr_rdunlock(CONFIG_LOCK); + else if (lock_levels.config == WRITE_LOCK) + _wr_wrunlock(CONFIG_LOCK); + return -1; + } + + return 0; } /* unlock_slurmctld - Issue the required unlock requests in a well * defined order */ -void unlock_slurmctld(slurmctld_lock_t lock_levels) +extern void unlock_slurmctld(slurmctld_lock_t lock_levels) { if (lock_levels.partition == READ_LOCK) _wr_rdunlock(PART_LOCK); @@ -123,16 +186,19 @@ void unlock_slurmctld(slurmctld_lock_t lock_levels) } /* _wr_rdlock - Issue a read lock on the specified data type */ -static void _wr_rdlock(lock_datatype_t datatype) +static bool _wr_rdlock(lock_datatype_t datatype, bool wait_lock) { + bool success = true; + slurm_mutex_lock(&locks_mutex); while (1) { - if ((slurmctld_locks.entity[write_wait_lock(datatype)] == - 0) - && (slurmctld_locks.entity[write_lock(datatype)] == - 0)) { + if ((slurmctld_locks.entity[write_wait_lock(datatype)] == 0) && + (slurmctld_locks.entity[write_lock(datatype)] == 0)) { slurmctld_locks.entity[read_lock(datatype)]++; break; + } else if (!wait_lock) { + success = false; + break; } else { /* wait for state change and retry */ pthread_cond_wait(&locks_cond, &locks_mutex); if (kill_thread) @@ -140,6 +206,7 @@ static void _wr_rdlock(lock_datatype_t datatype) } } slurm_mutex_unlock(&locks_mutex); + return success; } /* _wr_rdunlock - Issue a read unlock on the specified data type */ @@ -152,8 +219,10 @@ static void _wr_rdunlock(lock_datatype_t datatype) } /* _wr_wrlock - Issue a write lock on the specified data type */ -static void _wr_wrlock(lock_datatype_t datatype) +static bool _wr_wrlock(lock_datatype_t datatype, bool wait_lock) { + bool success = true; + slurm_mutex_lock(&locks_mutex); slurmctld_locks.entity[write_wait_lock(datatype)]++; @@ -161,8 +230,11 @@ static void _wr_wrlock(lock_datatype_t datatype) if ((slurmctld_locks.entity[read_lock(datatype)] == 0) && (slurmctld_locks.entity[write_lock(datatype)] == 0)) { slurmctld_locks.entity[write_lock(datatype)]++; - slurmctld_locks. - entity[write_wait_lock(datatype)]--; + slurmctld_locks.entity[write_wait_lock(datatype)]--; + break; + } else if (!wait_lock) { + slurmctld_locks.entity[write_wait_lock(datatype)]--; + success = false; break; } else { /* wait for state change and retry */ pthread_cond_wait(&locks_cond, &locks_mutex); @@ -171,6 +243,7 @@ static void _wr_wrlock(lock_datatype_t datatype) } } slurm_mutex_unlock(&locks_mutex); + return success; } /* _wr_wrunlock - Issue a write unlock on the specified data type */ @@ -192,18 +265,18 @@ void get_lock_values(slurmctld_lock_flags_t * lock_flags) } /* kill_locked_threads - Kill all threads waiting on semaphores */ -void kill_locked_threads(void) +extern void kill_locked_threads(void) { kill_thread = 1; pthread_cond_broadcast(&locks_cond); } /* un/lock semaphore used for saving state of slurmctld */ -void lock_state_files(void) +extern void lock_state_files(void) { slurm_mutex_lock(&state_mutex); } -void unlock_state_files(void) +extern void unlock_state_files(void) { slurm_mutex_unlock(&state_mutex); } diff --git a/src/slurmctld/locks.h b/src/slurmctld/locks.h index ff99e3014d6..ff287db8627 100644 --- a/src/slurmctld/locks.h +++ b/src/slurmctld/locks.h @@ -136,10 +136,13 @@ extern void init_locks ( void ); /* kill_locked_threads - Kill all threads waiting on semaphores */ extern void kill_locked_threads ( void ); -/* lock_slurmctld - Issue the required lock requests in a well defined order - * RET 0 on success, -1 on failure */ +/* lock_slurmctld - Issue the required lock requests in a well defined order */ extern void lock_slurmctld (slurmctld_lock_t lock_levels); +/* try_lock_slurmctld - equivalent to lock_slurmctld() except + * RET 0 on success or -1 if the locks are currently not available */ +extern int try_lock_slurmctld (slurmctld_lock_t lock_levels); + /* unlock_slurmctld - Issue the required unlock requests in a well * defined order */ extern void unlock_slurmctld (slurmctld_lock_t lock_levels); -- GitLab