From f3bce2df80e2373119e76e9916f9eb1668e2b908 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Sun, 21 Sep 2003 19:23:44 +0000
Subject: [PATCH] Parameterize the maximum time for the backup controller to
 relinquish control (it needs to complete all pending RPCs and save state
 before the primary reads state and takes over).

---
 src/slurmctld/controller.c | 11 +++++------
 src/slurmctld/proc_req.c   | 12 +++++++-----
 src/slurmctld/slurmctld.h  |  3 +++
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 4debd7cc745..8ffbd3e5f2f 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -601,14 +601,13 @@ static void *_slurmctld_background(void *no_data)
 		if (slurmctld_config.shutdown_time) {
 			int i;
 			/* wait for RPC's to complete */
-			for (i=0; ((i<2) && slurmctld_config.
-					server_thread_count); i++) {
-				debug2("server_thread_count=%d",
-					slurmctld_config.server_thread_count);
+			for (i = 1; i < CONTROL_TIMEOUT; i++) {
+				if (slurmctld_config.server_thread_count == 0)
+					break;
 				sleep(1);
 			}
 			if (slurmctld_config.server_thread_count)
-				info("shutdown server_thread_count %d", 
+				info("shutdown server_thread_count=%d", 
 					slurmctld_config.server_thread_count);
 			if (_report_locks_set() == 0)
 				save_all_state();
@@ -883,7 +882,7 @@ static int _shutdown_backup_controller(void)
 	req.msg_type = REQUEST_CONTROL;
 	req.data = NULL;
 
-	if (slurm_send_recv_rc_msg(&req, &rc, 0) < 0) {
+	if (slurm_send_recv_rc_msg(&req, &rc, CONTROL_TIMEOUT) < 0) {
 		error("shutdown_backup:send/recv: %m");
 		return SLURM_SOCKET_ERROR;
 	}
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 81b29152739..630e0ba5fda 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1271,15 +1271,17 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg)
 	}
 
 	if (msg->msg_type == REQUEST_CONTROL) {
-		/* wait for workload to dry up before sending reply */
-		for (i = 0; ((i < 10) && (slurmctld_config.
-				server_thread_count > 1)); i++) {
+		/* Wait for workload to dry up before sending reply.
+		 * One thread should remain, this one. */
+		for (i = 1; i < CONTROL_TIMEOUT; i++) {
+			if (slurmctld_config.server_thread_count <= 1)
+				break;
 			sleep(1);
 		}
 		if (slurmctld_config.server_thread_count > 1)
-			error("shutting down with server_thread_count=%d",
+			error("REQUEST_CONTROL reply with %d active threads",
 				slurmctld_config.server_thread_count);
-		save_all_state();
+		/* save_all_state();	performed by _slurmctld_background */
 	}
 	slurm_send_rc_msg(msg, error_code);
 	if ((error_code == SLURM_SUCCESS) && core_arg)
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index bb2078c611b..ee221cd2c63 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -101,6 +101,9 @@
  * Update the group uid_t access list as needed */
 #define	PERIODIC_GROUP_CHECK	600
 
+/* Seconds to wait for backup controller response to REQUEST_CONTROL RPC */
+#define CONTROL_TIMEOUT 4
+
 /* Default configuration configuration file values */
 #define DEFAULT_FAST_SCHEDULE       1
 #define DEFAULT_FIRST_JOB_ID        1
-- 
GitLab