From 55bcf1d1dde1e2fe375bd3bf806c8ca7a935a34f Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 9 Jun 2003 20:29:19 +0000
Subject: [PATCH] On slurmctld shutdown call qsw_fini() and save state. On
 slurmctld startup recover qsw state saved (if any and if "-c" option not
 used) and use as argument to qsw_init(). If no state to be preserved, call
 qsw_init(NULL) to initialize data structures.

---
 src/slurmctld/controller.c  |   7 +++
 src/slurmctld/read_config.c | 111 ++++++++++++++++++++++++++++++++++++
 src/slurmctld/slurmctld.h   |  13 +++++
 3 files changed, 131 insertions(+)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 598c056f76c..8b06b68e8b0 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -194,6 +194,11 @@ int main(int argc, char *argv[])
 		exit(1);
 	}
 
+	if (switch_state_begin(recover)) {
+		error("switch_state_begin: %m");
+		exit(1);
+	}
+
 	/*
 	 * Need to create pidfile here in case we setuid() below
 	 * (init_pidfile() exits if it can't initialize pid file)
@@ -354,6 +359,7 @@ static void *_slurmctld_signal_hand(void *no_data)
 			/* send REQUEST_SHUTDOWN_IMMEDIATE RPC */
 			_slurmctld_shutdown();
 			pthread_join(thread_id_rpc, NULL);
+			switch_state_fini();
 			return NULL;	/* Normal termination */
 			break;
 		case SIGHUP:	/* kill -1 */
@@ -626,6 +632,7 @@ static void *_slurmctld_background(void *no_data)
 	return NULL;
 }
 
+
 /* _save_all_state - save entire slurmctld state for later recovery */
 static void _save_all_state(void)
 {
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 83dbc83d78e..a2c0ab4a363 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -30,6 +30,7 @@
 
 #include <ctype.h>
 #include <errno.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -972,3 +973,113 @@ static void _validate_node_proc_count(void)
 	list_iterator_destroy(part_record_iterator);
 }
 #endif
+
+/*
+ * switch_state_begin - Recover or initialize switch state
+ * IN recover - If set, recover switch state as previously saved
+ * RET 0 if no error, otherwise an error code
+ */
+int switch_state_begin(int recover)
+{
+	int error_code = SLURM_SUCCESS;
+#ifdef HAVE_LIBELAN3
+	qsw_libstate_t old_state = NULL;
+	Buf buffer = NULL;
+	char *qsw_state_file = NULL, *data = NULL;
+	int state_fd, data_allocated, data_read= 0, data_size = 0;
+
+	if (recover) {
+		/* Read state from file into buffer */
+		qsw_state_file = xstrdup (slurmctld_conf.state_save_location);
+		xstrcat (qsw_state_file, "/qsw_state");
+		state_fd = open (qsw_state_file, O_RDONLY);
+		if (state_fd >= 0) {
+			data_allocated = BUF_SIZE;
+			data = xmalloc(data_allocated);
+			while ((data_read =
+				read (state_fd, &data[data_size],
+				BUF_SIZE)) == BUF_SIZE) {
+				data_size += data_read;
+				data_allocated += BUF_SIZE;
+				xrealloc(data, data_allocated);
+			}
+			data_size += data_read;
+			if (data_read < 0) {
+				error ("Read error on %s, %m", qsw_state_file);
+				error_code = SLURM_ERROR;
+				data_size = 0;
+			}
+			close (state_fd);
+		} else
+			info("No %s file to recover QSW state from",
+				qsw_state_file);
+		xfree(qsw_state_file);
+
+		if ((error_code == SLURM_SUCCESS) && data_size) {
+			if (qsw_alloc_libstate(&old_state)) {
+				error_code = SLURM_ERROR;
+			} else {
+				buffer = create_buf (data, data_size);
+				if (qsw_unpack_libstate(old_state, buffer) < 0)
+					error_code = errno;
+			}
+		}
+		if (buffer)
+			free_buf(buffer);
+		else if (data)
+			xfree(data);
+
+	}
+	if (error_code == SLURM_SUCCESS)
+		error_code = qsw_init(old_state);
+	if (old_state)
+		qsw_free_libstate(old_state);
+#endif /* HAVE_LIBELAN3 */
+	return error_code;
+}
+
+/*
+ * switch_state_fini - save switch state and shutdown switch
+ * RET 0 if no error, otherwise an error code
+ */
+int switch_state_fini(void)
+{
+	int error_code = SLURM_SUCCESS;
+#ifdef HAVE_LIBELAN3
+	qsw_libstate_t old_state = NULL;
+	Buf buffer = NULL;
+	char *qsw_state_file = NULL;
+	int state_fd;
+
+	if (qsw_alloc_libstate(&old_state))
+		return errno;
+	qsw_fini(old_state);
+	buffer = init_buf(1024);
+	error_code = qsw_pack_libstate(old_state, buffer);
+	qsw_state_file = xstrdup (slurmctld_conf.state_save_location);
+	xstrcat (qsw_state_file, "/qsw_state");
+	(void) unlink (qsw_state_file);
+	state_fd = creat (qsw_state_file, 0600);
+	if (state_fd < 0) {	/* creat() reports failure with -1 */
+		error_code = errno;	/* capture before logging can alter errno */
+		error ("Can't save state, error creating file %s %m",
+			qsw_state_file);
+	}
+	else {
+		if (write (state_fd, get_buf_data(buffer),
+		           get_buf_offset(buffer)) !=
+		    get_buf_offset(buffer)) {
+			error_code = errno;	/* capture before logging can alter errno */
+			error ("Can't save state, error writing file %s %m",
+				qsw_state_file);
+		}
+		close (state_fd);
+	}
+	xfree (qsw_state_file);
+	if (buffer)
+		free_buf(buffer);
+	if (old_state)
+		qsw_free_libstate(old_state);
+#endif /* HAVE_LIBELAN3 */
+	return error_code;
+}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 5500af41008..6a9b0cd0c5e 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -923,6 +923,19 @@ extern int step_create ( step_specs *step_specs,
 extern bool step_on_node(struct job_record *job_ptr,
 			 struct node_record *node_ptr);
 
+/*
+ * switch_state_fini - save switch state and shutdown switch
+ * RET 0 if no error, otherwise an error code
+ */
+extern int switch_state_fini(void);
+
+/*
+ * switch_state_begin - Recover or initialize switch state
+ * IN recover - If set, recover switch state as previously saved
+ * RET 0 if no error, otherwise an error code
+ */
+extern int switch_state_begin(int recover);
+
 /*
  * Synchronize the batch job in the system with their files.
  * All pending batch jobs must have script and environment files
-- 
GitLab