From 1abc72295001e173d54ba130ff31d9898fbc92d6 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 24 Jul 2003 00:48:44 +0000
Subject: [PATCH] Add deletion of SlurmctldPidFile on daemon termination.
 Explicitly kill any daemon still running when a new daemon is initiated.

---
 src/slurmctld/controller.c | 57 ++++++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 2dc8598e3fd..d6f361dc498 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -49,6 +49,7 @@
 #include <slurm/slurm_errno.h>
 
 #include "src/common/daemonize.h"
+#include "src/common/fd.h"
 #include "src/common/hostlist.h"
 #include "src/common/log.h"
 #include "src/common/macros.h"
@@ -113,6 +114,8 @@ static int          _background_process_msg(slurm_msg_t * msg);
 static void *       _background_rpc_mgr(void *no_data);
 static void *       _background_signal_hand(void *no_data);
 static void         _fill_ctld_conf(slurm_ctl_conf_t * build_ptr);
+static void         _init_pidfile(void);
+static void         _kill_old_slurmctld(void);
 static int          _make_step_cred(struct step_record *step_rec, 
 				    slurm_cred_t *slurm_cred);
 static void         _parse_commandline(int argc, char *argv[], 
@@ -148,7 +151,6 @@ inline static void  _slurm_rpc_update_partition(slurm_msg_t * msg);
 static void *       _slurmctld_background(void *no_data);
 static void         _slurmctld_req(slurm_msg_t * msg);
 static void *       _slurmctld_rpc_mgr(void *no_data);
-static void         _init_pidfile(void);
 inline static int   _slurmctld_shutdown(void);
 static void *       _slurmctld_signal_hand(void *no_data);
 inline static void  _update_cred_key(void);
@@ -174,7 +176,6 @@ int main(int argc, char *argv[])
 	thread_id_main = pthread_self();
 
 	slurmctld_pid = getpid();
-	slurmctld_conf.slurm_conf = xstrdup(SLURM_CONFIG_FILE);
 	_parse_commandline(argc, argv, &slurmctld_conf);
 	init_locks();
 
@@ -187,23 +188,30 @@ int main(int argc, char *argv[])
 		setrlimit(RLIMIT_CORE, &rlim);
 	}
 
-	if ((error_code = read_slurm_conf(recover))) {
-		error("read_slurm_conf error %d reading %s",
+	/* Get SlurmctldPidFile  for _kill_old_slurmctld */
+	if ((error_code = read_slurm_conf_ctl (&slurmctld_conf))) {
+		error("read_slurm_conf_ctl error %d reading %s",
 		      error_code, SLURM_CONFIG_FILE);
 		exit(1);
 	}
-
-	if (switch_state_begin(recover)) {
-		error("switch_state_begin: %m");
+	_kill_old_slurmctld();
+	/* Now recover the remaining state information */
+	if ((error_code = read_slurm_conf(recover))) {
+		error("read_slurm_conf reading %s: %m",
+		      error_code, SLURM_CONFIG_FILE);
 		exit(1);
 	}
-
 	/* 
 	 * Need to create pidfile here in case we setuid() below
 	 * (init_pidfile() exits if it can't initialize pid file)
 	 */
 	_init_pidfile();
 
+	if (switch_state_begin(recover)) {
+		error("switch_state_begin: %m");
+		exit(1);
+	}
+
 	if ((slurmctld_conf.slurm_user_id) && 
 	    (slurmctld_conf.slurm_user_id != getuid()) &&
 	    (setuid(slurmctld_conf.slurm_user_id))) {
@@ -311,6 +319,10 @@ int main(int argc, char *argv[])
 			break;
 	}
 
+	if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
+		error("Unable to remove pidfile '%s': %m",
+		      slurmctld_conf.slurmctld_pidfile);
+
 #if	MEM_LEAK_TEST
 	/* This should purge all allocated memory,   *\
 	\*   Anything left over represents a leak.   */
@@ -2215,6 +2227,9 @@ static void _run_backup(void)
 	if (shutdown_time != 0) {
 		pthread_join(thread_id_sig, NULL);
 		info("BackupController terminating");
+		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
+			error("Unable to remove pidfile '%s': %m",
+		     	 slurmctld_conf.slurmctld_pidfile);
 		log_fini();
 		exit(0);
 	}
@@ -2479,18 +2494,36 @@ void update_logging(void)
 		  slurmctld_conf.slurmctld_logfile);
 }
 
-static void 
+/* Kill the currently running slurmctld */
+static void
+_kill_old_slurmctld(void)
+{
+	int fd;
+	pid_t oldpid = read_pidfile(slurmctld_conf.slurmctld_pidfile, &fd);
+	if (oldpid != (pid_t) 0) {
+		info ("killing old slurmctld[%ld]", (long) oldpid);
+		kill(oldpid, SIGTERM);
+
+		/* 
+		 * Wait for previous daemon to terminate
+		 */
+		if (fd_get_readw_lock(fd) < 0) 
+			fatal ("unable to wait for readw lock: %m");
+		(void) close(fd); /* Ignore errors */ 
+	}
+}
+
+static void
 _init_pidfile(void)
 {
-	int   fd      = -1;
+	int   fd;
 	uid_t uid     = slurmctld_conf.slurm_user_id;
 
-	if ((fd = create_pidfile(slurmctld_conf.slurmctld_pidfile)) < 0) 
+	if ((fd = create_pidfile(slurmctld_conf.slurmctld_pidfile)) < 0)
 		return;
 
 	if (uid && (fchown(fd, uid, -1) < 0))
 		error ("Unable to reset owner of pidfile: %m");
-
 	/*
 	 * Close fd here, otherwise we'll deadlock since create_pidfile()
 	 * flocks the pidfile.
-- 
GitLab