From 55bcf1d1dde1e2fe375bd3bf806c8ca7a935a34f Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 9 Jun 2003 20:29:19 +0000
Subject: [PATCH] On slurmctld shutdown call qsw_fini() and save state. On
 slurmctld startup recover qsw state saved (if any and if "-c" option not
 used) and use as argument to qsw_init(). If no state to be preserved, call
 qsw_init(NULL) to initialize data structures.

---
 src/slurmctld/controller.c  |   7 +++
 src/slurmctld/read_config.c | 111 ++++++++++++++++++++++++++++++++++++
 src/slurmctld/slurmctld.h   |  13 +++++
 3 files changed, 131 insertions(+)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 598c056f76c..8b06b68e8b0 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -194,6 +194,11 @@ int main(int argc, char *argv[])
 		exit(1);
 	}
 
+	if (switch_state_begin(recover)) {
+		error("switch_state_begin: %m");
+		exit(1);
+	}
+
 	/* 
 	 * Need to create pidfile here in case we setuid() below
 	 * (init_pidfile() exits if it can't initialize pid file)
@@ -354,6 +359,7 @@ static void *_slurmctld_signal_hand(void *no_data)
 			/* send REQUEST_SHUTDOWN_IMMEDIATE RPC */
 			_slurmctld_shutdown();
 			pthread_join(thread_id_rpc, NULL);
+			switch_state_fini();
 			return NULL;	/* Normal termination */
 			break;
 		case SIGHUP:	/* kill -1 */
@@ -626,6 +632,7 @@ static void *_slurmctld_background(void *no_data)
 	return NULL;
 }
 
+
 /* _save_all_state - save entire slurmctld state for later recovery */
 static void _save_all_state(void)
 {
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 83dbc83d78e..a2c0ab4a363 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -30,6 +30,7 @@
 
 #include <ctype.h>
 #include <errno.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -972,3 +973,113 @@ static void _validate_node_proc_count(void)
 	list_iterator_destroy(part_record_iterator);
 }
 #endif
+
+/*
+ * switch_state_begin - Recover or initialize switch state via qsw_init()
+ * IN recover - If set, load <state_save_location>/qsw_state (if it exists)
+ * RET 0 if no error, otherwise an error code (always 0 w/o HAVE_LIBELAN3)
+ */ 
+int switch_state_begin(int recover)
+{
+	int error_code = SLURM_SUCCESS;
+#ifdef HAVE_LIBELAN3
+	qsw_libstate_t old_state = NULL;	/* NULL unless recovered */
+	Buf buffer = NULL;
+	char *qsw_state_file = NULL, *data = NULL;
+	int state_fd, data_allocated, data_read= 0, data_size = 0;
+
+	if (recover) {
+		/* Read file in BUF_SIZE chunks; a short read ends the loop */
+		qsw_state_file = xstrdup (slurmctld_conf.state_save_location);
+		xstrcat (qsw_state_file, "/qsw_state");
+		state_fd = open (qsw_state_file, O_RDONLY);
+		if (state_fd >= 0) {
+			data_allocated = BUF_SIZE;
+			data = xmalloc(data_allocated);
+			while ((data_read = 
+					read (state_fd, &data[data_size], 
+					BUF_SIZE)) == BUF_SIZE) {
+				data_size += data_read;	/* full chunk, go on */
+				data_allocated += BUF_SIZE;
+				xrealloc(data, data_allocated);
+			}
+			data_size += data_read;	/* last, partial chunk */
+			if (data_read < 0) {
+				error ("Read error on %s, %m", qsw_state_file);
+				error_code = SLURM_ERROR;
+				data_size = 0;	/* discard partial data */
+			}
+			close (state_fd);
+		} else	/* open failure is logged, not an error */
+			info("No %s file to recover QSW state from", 
+				qsw_state_file);
+		xfree(qsw_state_file);
+		/* Unpack any data read into a libstate for qsw_init() */
+		if ((error_code == SLURM_SUCCESS) && data_size) {
+			if (qsw_alloc_libstate(&old_state)) {
+				error_code = SLURM_ERROR;
+			} else {
+				buffer = create_buf (data, data_size);
+				if (qsw_unpack_libstate(old_state, buffer) < 0)
+					error_code = errno;
+			}
+		}
+		if (buffer)	/* presumably also frees data - confirm */
+			free_buf(buffer);
+		else if (data)
+			xfree(data);
+
+	}
+	if (error_code == SLURM_SUCCESS)	/* skipped if recovery failed */
+		error_code = qsw_init(old_state);	/* NULL: start fresh */
+	if (old_state)
+		qsw_free_libstate(old_state);
+#endif				/* HAVE_LIBELAN3 */
+	return error_code;
+}
+
+/*
+ * switch_state_fini - shut down the switch with qsw_fini(), then pack the
+ *	library state and write it to <state_save_location>/qsw_state
+ * RET 0 if no error, otherwise an error code (always 0 w/o HAVE_LIBELAN3)
+ */
+int switch_state_fini(void)
+{
+	int error_code = SLURM_SUCCESS;
+#ifdef HAVE_LIBELAN3
+	qsw_libstate_t old_state = NULL;
+	Buf buffer = NULL;
+	char *qsw_state_file = NULL;
+	int state_fd;
+
+	if (qsw_alloc_libstate(&old_state))
+		return errno;
+	qsw_fini(old_state);
+	buffer = init_buf(1024);
+	error_code = qsw_pack_libstate(old_state, buffer);
+	qsw_state_file = xstrdup (slurmctld_conf.state_save_location);
+	xstrcat (qsw_state_file, "/qsw_state");
+	(void) unlink (qsw_state_file);
+	state_fd = creat (qsw_state_file, 0600);
+	if (state_fd < 0) {	/* creat() fails with -1; 0 is a valid fd */
+		error_code = errno;	/* save before error() can clobber it */
+		error ("Can't save state, error creating file %s %m",
+		       qsw_state_file);
+	} else {
+		if (write (state_fd, get_buf_data(buffer),
+				get_buf_offset(buffer)) !=
+				get_buf_offset(buffer)) {
+			error_code = errno;
+			error ("Can't save state, error writing file %s %m",
+			       qsw_state_file);
+		}
+		close (state_fd);
+	}
+	xfree (qsw_state_file);
+	if (buffer)
+		free_buf(buffer);
+	if (old_state)
+		qsw_free_libstate(old_state);
+#endif				/* HAVE_LIBELAN3 */
+	return error_code;
+}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 5500af41008..6a9b0cd0c5e 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -923,6 +923,19 @@ extern int step_create ( step_specs *step_specs,
 extern bool step_on_node(struct job_record  *job_ptr, 
 			 struct node_record *node_ptr);
 
+/*
+ * switch_state_fini - save switch state and shutdown switch
+ * RET 0 if no error, otherwise an error code
+ */ 
+extern int switch_state_fini(void);
+
+/*
+ * switch_state_begin - Recover or initialize switch state
+ * IN recover - If set, recover switch state as previously saved
+ * RET 0 if no error, otherwise an error code
+ */ 
+extern int switch_state_begin(int recover);
+
 /*
  * Synchronize the batch job in the system with their files.
  * All pending batch jobs must have script and environment files
-- 
GitLab