From ac94747a847964c804847f0ee40e578d337424a4 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 24 Aug 2010 18:47:39 +0000
Subject: [PATCH] Add "try_lock" logic to slurmctld for background functions to
 be deferred if required locks are not immediately available. This will
 improve performance when heavy-weight functions such as the backfill
 scheduler are running.

---
 NEWS                       |   4 ++
 src/slurmctld/controller.c |  80 ++++++++++++------------
 src/slurmctld/locks.c      | 123 +++++++++++++++++++++++++++++--------
 src/slurmctld/locks.h      |   7 ++-
 4 files changed, 147 insertions(+), 67 deletions(-)
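Reviewer note (commentary only; git am ignores text between the "---" line and
the first diff): below is a minimal sketch of the caller pattern this patch
introduces in _slurmctld_background().  The names do_background_work,
WORK_INTERVAL and last_work_time are hypothetical placeholders, and the lock
declaration is illustrative rather than copied from controller.c.  The key
rule is that unlock_slurmctld() is only called when try_lock_slurmctld()
returned 0; on -1 the work item is skipped and retried on a later pass of the
loop instead of blocking the background thread.

	/* Illustrative only: fields are config, job, node, partition */
	slurmctld_lock_t node_write_lock =
		{ NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };

	if ((difftime(now, last_work_time) >= WORK_INTERVAL) &&
	    (try_lock_slurmctld(node_write_lock) == 0)) {
		last_work_time = now;
		do_background_work();	/* hypothetical helper */
		unlock_slurmctld(node_write_lock);
	}
	/* else: locks are busy (e.g. the backfill scheduler holds them);
	 * defer the work until the next pass of the loop */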

diff --git a/NEWS b/NEWS
index 5e8c8e5dbac..e2fe9019ae4 100644
--- a/NEWS
+++ b/NEWS
@@ -31,6 +31,10 @@ documents those changes that are of interest to users and admins.
     select/cray plugin cons_res is the default.  To use linear add 'Linear' to
     SelectTypeParameters.
 -- Fixed bug where resizing jobs didn't set used limits correctly.
+ -- Add "try_lock" logic to slurmctld for background functions to be deferred
+    if required locks are not immediately available. This will improve
+    performance when heavy-weight functions such as the backfill scheduler
+    are running.
 
 * Changes in SLURM 2.2.0.pre9
 =============================
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index f83bb232033..69c1f62e484 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1217,6 +1217,7 @@ static void *_slurmctld_background(void *no_data)
 	static time_t last_assert_primary_time;
 	static time_t last_trigger;
 	static time_t last_node_acct;
+	static bool ping_msg_sent = false;
 	time_t now;
 	int no_resp_msg_interval, ping_interval, purge_job_interval;
 	int group_time, group_force;
@@ -1306,25 +1307,25 @@ static void *_slurmctld_background(void *no_data)
 			break;
 		}
 
-		if (difftime(now, last_resv_time) >= 2) {
+		if ((difftime(now, last_resv_time) >= 2) &&
+		    (try_lock_slurmctld(node_write_lock) == 0)) {
 			last_resv_time = now;
-			lock_slurmctld(node_write_lock);
 			set_node_maint_mode();
 			unlock_slurmctld(node_write_lock);
 		}
 
-		if (difftime(now, last_no_resp_msg_time) >=
-		    no_resp_msg_interval) {
+		if ((difftime(now, last_no_resp_msg_time) >=
+		     no_resp_msg_interval) &&
+		    (try_lock_slurmctld(node_write_lock2) == 0)) {
 			last_no_resp_msg_time = now;
-			lock_slurmctld(node_write_lock2);
 			node_no_resp_msg();
 			unlock_slurmctld(node_write_lock2);
 		}
 
-		if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) {
+		if ((difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) &&
+		    (try_lock_slurmctld(job_write_lock) == 0)) {
 			last_timelimit_time = now;
 			debug2("Testing job time limits and checkpoints");
-			lock_slurmctld(job_write_lock);
 			job_time_limit();
 			step_checkpoint();
 			unlock_slurmctld(job_write_lock);
@@ -1332,44 +1333,43 @@ static void *_slurmctld_background(void *no_data)
 
 		if (slurmctld_conf.health_check_interval &&
 		    (difftime(now, last_health_check_time) >=
-		     slurmctld_conf.health_check_interval)) {
-			if (is_ping_done()) {
-				last_health_check_time = now;
-				lock_slurmctld(node_write_lock);
-				run_health_check();
+		     slurmctld_conf.health_check_interval) &&
+		    is_ping_done() &&
+		    (try_lock_slurmctld(node_write_lock) == 0)) {
+			last_health_check_time = now;
+			run_health_check();
 #ifdef HAVE_CRAY_XT
-				basil_query();
+			basil_query();
 #endif
-				unlock_slurmctld(node_write_lock);
-			}
+			unlock_slurmctld(node_write_lock);
 		}
-		if ((difftime(now, last_ping_node_time) >= ping_interval) ||
-		    ping_nodes_now) {
-			static bool msg_sent = false;
-			if (is_ping_done()) {
-				msg_sent = false;
-				last_ping_node_time = now;
-				ping_nodes_now = false;
-				lock_slurmctld(node_write_lock);
-				ping_nodes();
-				unlock_slurmctld(node_write_lock);
-			} else if ((!msg_sent) && (!ping_nodes_now)) {
-				/* log failure once per ping_nodes() call,
-				 * no error if node state update request
-				 * processed while the ping is in progress */
-				error("Node ping apparently hung, "
-				      "many nodes may be DOWN or configured "
-				      "SlurmdTimeout should be increased");
-				msg_sent = true;
-			}
+		if (((difftime(now, last_ping_node_time) >= ping_interval) ||
+		     ping_nodes_now) &&
+		    is_ping_done() &&
+		    (try_lock_slurmctld(node_write_lock) == 0)) {
+			ping_msg_sent = false;
+			last_ping_node_time = now;
+			ping_nodes_now = false;
+			ping_nodes();
+			unlock_slurmctld(node_write_lock);
+		} else if ((difftime(now, last_ping_node_time) >=
+			    ping_interval) && !is_ping_done() && 
+			    !ping_msg_sent) {
+			/* log failure once per ping_nodes() call,
+			 * no error if node state update request
+			 * processed while the ping is in progress */
+			error("Node ping apparently hung, "
+			      "many nodes may be DOWN or configured "
+			      "SlurmdTimeout should be increased");
+			ping_msg_sent = true;
 		}
 
 		if (slurmctld_conf.inactive_limit &&
 		    (difftime(now, last_ping_srun_time) >=
-		     (slurmctld_conf.inactive_limit / 3))) {
+		     (slurmctld_conf.inactive_limit / 3)) &&
+		    (try_lock_slurmctld(job_read_lock) == 0)) {
 			last_ping_srun_time = now;
 			debug2("Performing srun ping");
-			lock_slurmctld(job_read_lock);
 			srun_ping();
 			unlock_slurmctld(job_read_lock);
 		}
@@ -1379,21 +1379,21 @@ static void *_slurmctld_background(void *no_data)
 
 		group_time  = slurmctld_conf.group_info & GROUP_TIME_MASK;
 		if (group_time &&
-		    (difftime(now, last_group_time) >= group_time)) {
+		    (difftime(now, last_group_time) >= group_time) &&
+		    (try_lock_slurmctld(part_write_lock) == 0)) {
 			if (slurmctld_conf.group_info & GROUP_FORCE)
 				group_force = 1;
 			else
 				group_force = 0;
 			last_group_time = now;
-			lock_slurmctld(part_write_lock);
 			load_part_uid_allow_list(group_force);
 			unlock_slurmctld(part_write_lock);
 		}
 
-		if (difftime(now, last_purge_job_time) >= purge_job_interval) {
+		if ((difftime(now, last_purge_job_time) >= purge_job_interval) &&
+		    (try_lock_slurmctld(job_write_lock) == 0)) {
 			last_purge_job_time = now;
 			debug2("Performing purge of old job records");
-			lock_slurmctld(job_write_lock);
 			purge_old_job();
 			unlock_slurmctld(job_write_lock);
 		}
diff --git a/src/slurmctld/locks.c b/src/slurmctld/locks.c
index c557f331b63..34ed56812f2 100644
--- a/src/slurmctld/locks.c
+++ b/src/slurmctld/locks.c
@@ -59,9 +59,9 @@ static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
 static slurmctld_lock_flags_t slurmctld_locks;
 static int kill_thread = 0;
 
-static void _wr_rdlock(lock_datatype_t datatype);
+static bool _wr_rdlock(lock_datatype_t datatype, bool wait_lock);
 static void _wr_rdunlock(lock_datatype_t datatype);
-static void _wr_wrlock(lock_datatype_t datatype);
+static bool _wr_wrlock(lock_datatype_t datatype, bool wait_lock);
 static void _wr_wrunlock(lock_datatype_t datatype);
 
 /* init_locks - create locks used for slurmctld data structure access
@@ -72,34 +72,97 @@ void init_locks(void)
 	memset((void *) &slurmctld_locks, 0, sizeof(slurmctld_locks));
 }
 
-/* lock_slurmctld - Issue the required lock requests in a well defined order
- * RET 0 on success, -1 on failure */
-void lock_slurmctld(slurmctld_lock_t lock_levels)
+/* lock_slurmctld - Issue the required lock requests in a well defined order */
+extern void lock_slurmctld(slurmctld_lock_t lock_levels)
 {
 	if (lock_levels.config == READ_LOCK)
-		_wr_rdlock(CONFIG_LOCK);
+		(void) _wr_rdlock(CONFIG_LOCK, true);
 	else if (lock_levels.config == WRITE_LOCK)
-		_wr_wrlock(CONFIG_LOCK);
+		(void) _wr_wrlock(CONFIG_LOCK, true);
 
 	if (lock_levels.job == READ_LOCK)
-		_wr_rdlock(JOB_LOCK);
+		(void) _wr_rdlock(JOB_LOCK, true);
 	else if (lock_levels.job == WRITE_LOCK)
-		_wr_wrlock(JOB_LOCK);
+		(void) _wr_wrlock(JOB_LOCK, true);
 
 	if (lock_levels.node == READ_LOCK)
-		_wr_rdlock(NODE_LOCK);
+		(void) _wr_rdlock(NODE_LOCK, true);
 	else if (lock_levels.node == WRITE_LOCK)
-		_wr_wrlock(NODE_LOCK);
+		(void) _wr_wrlock(NODE_LOCK, true);
 
 	if (lock_levels.partition == READ_LOCK)
-		_wr_rdlock(PART_LOCK);
+		(void) _wr_rdlock(PART_LOCK, true);
 	else if (lock_levels.partition == WRITE_LOCK)
-		_wr_wrlock(PART_LOCK);
+		(void) _wr_wrlock(PART_LOCK, true);
+}
+
+/* try_lock_slurmctld - equivalent to lock_slurmctld() except that it does not
+ * block. RET 0 on success or -1 if the locks are currently not available */
+extern int try_lock_slurmctld (slurmctld_lock_t lock_levels)
+{
+	bool success = true;
+
+	if (lock_levels.config == READ_LOCK)
+		success = _wr_rdlock(CONFIG_LOCK, false);
+	else if (lock_levels.config == WRITE_LOCK)
+		success = _wr_wrlock(CONFIG_LOCK, false);
+	if (!success)
+		return -1;
+
+	if (lock_levels.job == READ_LOCK)
+		success = _wr_rdlock(JOB_LOCK, false);
+	else if (lock_levels.job == WRITE_LOCK)
+		success = _wr_wrlock(JOB_LOCK, false);
+	if (!success) {
+		if (lock_levels.config == READ_LOCK)
+			_wr_rdunlock(CONFIG_LOCK);
+		else if (lock_levels.config == WRITE_LOCK)
+			_wr_wrunlock(CONFIG_LOCK);
+		return -1;
+	}
+
+	if (lock_levels.node == READ_LOCK)
+		success = _wr_rdlock(NODE_LOCK, false);
+	else if (lock_levels.node == WRITE_LOCK)
+		success = _wr_wrlock(NODE_LOCK, false);
+	if (!success) {
+		if (lock_levels.job == READ_LOCK)
+			_wr_rdunlock(JOB_LOCK);
+		else if (lock_levels.job == WRITE_LOCK)
+			_wr_wrunlock(JOB_LOCK);
+		if (lock_levels.config == READ_LOCK)
+			_wr_rdunlock(CONFIG_LOCK);
+		else if (lock_levels.config == WRITE_LOCK)
+			_wr_wrunlock(CONFIG_LOCK);
+		return -1;
+	}
+
+	if (lock_levels.partition == READ_LOCK)
+		success = _wr_rdlock(PART_LOCK, false);
+	else if (lock_levels.partition == WRITE_LOCK)
+		success = _wr_wrlock(PART_LOCK, false);
+	if (!success) {
+		if (lock_levels.node == READ_LOCK)
+			_wr_rdunlock(NODE_LOCK);
+		else if (lock_levels.node == WRITE_LOCK)
+			_wr_wrunlock(NODE_LOCK);
+		if (lock_levels.job == READ_LOCK)
+			_wr_rdunlock(JOB_LOCK);
+		else if (lock_levels.job == WRITE_LOCK)
+			_wr_wrunlock(JOB_LOCK);
+		if (lock_levels.config == READ_LOCK)
+			_wr_rdunlock(CONFIG_LOCK);
+		else if (lock_levels.config == WRITE_LOCK)
+			_wr_wrunlock(CONFIG_LOCK);
+		return -1;
+	}
+
+	return 0;
 }
 
 /* unlock_slurmctld - Issue the required unlock requests in a well
  *	defined order */
-void unlock_slurmctld(slurmctld_lock_t lock_levels)
+extern void unlock_slurmctld(slurmctld_lock_t lock_levels)
 {
 	if (lock_levels.partition == READ_LOCK)
 		_wr_rdunlock(PART_LOCK);
@@ -123,16 +186,19 @@ void unlock_slurmctld(slurmctld_lock_t lock_levels)
 }
 
 /* _wr_rdlock - Issue a read lock on the specified data type */
-static void _wr_rdlock(lock_datatype_t datatype)
+static bool _wr_rdlock(lock_datatype_t datatype, bool wait_lock)
 {
+	bool success = true;
+
 	slurm_mutex_lock(&locks_mutex);
 	while (1) {
-		if ((slurmctld_locks.entity[write_wait_lock(datatype)] ==
-		     0)
-		    && (slurmctld_locks.entity[write_lock(datatype)] ==
-			0)) {
+		if ((slurmctld_locks.entity[write_wait_lock(datatype)] == 0) &&
+		    (slurmctld_locks.entity[write_lock(datatype)] == 0)) {
 			slurmctld_locks.entity[read_lock(datatype)]++;
 			break;
+		} else if (!wait_lock) {
+			success = false;
+			break;
 		} else {	/* wait for state change and retry */
 			pthread_cond_wait(&locks_cond, &locks_mutex);
 			if (kill_thread)
@@ -140,6 +206,7 @@ static void _wr_rdlock(lock_datatype_t datatype)
 		}
 	}
 	slurm_mutex_unlock(&locks_mutex);
+	return success;
 }
 
 /* _wr_rdunlock - Issue a read unlock on the specified data type */
@@ -152,8 +219,10 @@ static void _wr_rdunlock(lock_datatype_t datatype)
 }
 
 /* _wr_wrlock - Issue a write lock on the specified data type */
-static void _wr_wrlock(lock_datatype_t datatype)
+static bool _wr_wrlock(lock_datatype_t datatype, bool wait_lock)
 {
+	bool success = true;
+
 	slurm_mutex_lock(&locks_mutex);
 	slurmctld_locks.entity[write_wait_lock(datatype)]++;
 
@@ -161,8 +230,11 @@ static void _wr_wrlock(lock_datatype_t datatype)
 		if ((slurmctld_locks.entity[read_lock(datatype)] == 0) &&
 		    (slurmctld_locks.entity[write_lock(datatype)] == 0)) {
 			slurmctld_locks.entity[write_lock(datatype)]++;
-			slurmctld_locks.
-			    entity[write_wait_lock(datatype)]--;
+			slurmctld_locks.entity[write_wait_lock(datatype)]--;
+			break;
+		} else if (!wait_lock) {
+			slurmctld_locks.entity[write_wait_lock(datatype)]--;
+			success = false;
 			break;
 		} else {	/* wait for state change and retry */
 			pthread_cond_wait(&locks_cond, &locks_mutex);
@@ -171,6 +243,7 @@ static void _wr_wrlock(lock_datatype_t datatype)
 		}
 	}
 	slurm_mutex_unlock(&locks_mutex);
+	return success;
 }
 
 /* _wr_wrunlock - Issue a write unlock on the specified data type */
@@ -192,18 +265,18 @@ void get_lock_values(slurmctld_lock_flags_t * lock_flags)
 }
 
 /* kill_locked_threads - Kill all threads waiting on semaphores */
-void kill_locked_threads(void)
+extern void kill_locked_threads(void)
 {
 	kill_thread = 1;
 	pthread_cond_broadcast(&locks_cond);
 }
 
 /* un/lock semaphore used for saving state of slurmctld */
-void lock_state_files(void)
+extern void lock_state_files(void)
 {
 	slurm_mutex_lock(&state_mutex);
 }
-void unlock_state_files(void)
+extern void unlock_state_files(void)
 {
 	slurm_mutex_unlock(&state_mutex);
 }
diff --git a/src/slurmctld/locks.h b/src/slurmctld/locks.h
index ff99e3014d6..ff287db8627 100644
--- a/src/slurmctld/locks.h
+++ b/src/slurmctld/locks.h
@@ -136,10 +136,13 @@ extern void init_locks ( void );
 /* kill_locked_threads - Kill all threads waiting on semaphores */
 extern void kill_locked_threads ( void );
 
-/* lock_slurmctld - Issue the required lock requests in a well defined order
- * RET 0 on success, -1 on failure */
+/* lock_slurmctld - Issue the required lock requests in a well defined order */
 extern void lock_slurmctld (slurmctld_lock_t lock_levels);
 
+/* try_lock_slurmctld - equivalent to lock_slurmctld() except that it does not
+ * block. RET 0 on success or -1 if the locks are currently not available */
+extern int try_lock_slurmctld (slurmctld_lock_t lock_levels);
+
 /* unlock_slurmctld - Issue the required unlock requests in a well
  *	defined order */
 extern void unlock_slurmctld (slurmctld_lock_t lock_levels);
-- 
GitLab
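
Reviewer note (illustration only, not SLURM code): try_lock_slurmctld()
follows the usual "acquire in a fixed global order, roll back on failure"
pattern.  The self-contained pthread sketch below shows the same idea with
two plain mutexes; the lock names lock_a/lock_b and the helpers
try_lock_both()/unlock_both() are made up for this note.  The real code in
locks.c uses its counting reader/writer locks rather than mutexes, but the
deadlock-avoidance reasoning is the same: locks are only ever taken in one
order, and a failed trylock releases everything acquired so far before
returning -1, so a caller never ends up holding a partial lock set.

	#include <pthread.h>

	static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

	/* Try to take lock_a then lock_b without blocking.
	 * RET 0 on success, -1 if either lock is currently busy. */
	static int try_lock_both(void)
	{
		if (pthread_mutex_trylock(&lock_a) != 0)
			return -1;
		if (pthread_mutex_trylock(&lock_b) != 0) {
			/* roll back the lock already held */
			pthread_mutex_unlock(&lock_a);
			return -1;
		}
		return 0;
	}

	static void unlock_both(void)
	{
		/* release in the reverse order of acquisition */
		pthread_mutex_unlock(&lock_b);
		pthread_mutex_unlock(&lock_a);
	}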