From cfc6ca6cb375a4ed47eda430f84608f4662ab4be Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 1 Jun 2007 17:52:14 +0000
Subject: [PATCH] wiki: retry opening SchedulerPort once per minute instead of calling fatal()

---
 src/plugins/sched/wiki/msg.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/plugins/sched/wiki/msg.c b/src/plugins/sched/wiki/msg.c
index 7c92b0d28d6..01e0c5fb43a 100644
--- a/src/plugins/sched/wiki/msg.c
+++ b/src/plugins/sched/wiki/msg.c
@@ -134,17 +134,27 @@ extern void term_msg_thread(void)
 \*****************************************************************************/
 static void *_msg_thread(void *no_data)
 {
-	slurm_fd sock_fd, new_fd;
+	slurm_fd sock_fd = -1, new_fd;
 	slurm_addr cli_addr;
 	char *msg;
 	slurm_ctl_conf_t *conf = slurm_conf_lock();
+	int i;
 
 	sched_port = conf->schedport;
 	slurm_conf_unlock();
-	if ((sock_fd = slurm_init_msg_engine_port(sched_port)) 
-			== SLURM_SOCKET_ERROR) {
-		fatal("wiki: slurm_init_msg_engine_port %u %m",
+
+	/* If SchedulerPort is already taken, keep trying to open it
+	 * once per minute. Slurmctld will continue to function
+	 * during this interval even if nothing can be scheduled. */
+	for (i=0; (!thread_shutdown); i++) {
+		if (i > 0)
+			sleep(60);
+		sock_fd = slurm_init_msg_engine_port(sched_port);
+		if (sock_fd != SLURM_SOCKET_ERROR)
+			break;
+		error("wiki: slurm_init_msg_engine_port %u %m",
 			sched_port);
+		error("wiki: Unable to communicate with Moab");
 	}
 
 	/* Process incoming RPCs until told to shutdown */
@@ -170,7 +180,8 @@ static void *_msg_thread(void *no_data)
 		xfree(msg);
 		slurm_close_accepted_conn(new_fd);
 	}
-	(void) slurm_shutdown_msg_engine(sock_fd);
+	if (sock_fd > 0)
+		(void) slurm_shutdown_msg_engine(sock_fd);
 	pthread_exit((void *) 0);
 	return NULL;
 }
-- 
GitLab