From 7363961c366b7c1d777c2902a6b46aad0bc4acf3 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 27 Feb 2004 01:29:04 +0000 Subject: [PATCH] The ControllerMachine will periodically tell the BackupController machine to relinquish control. This is necessary since a temporary network problem can result in the BackupController becoming the primary server even while the server on the ControllerMachine continues execution. While this event is impossible to prevent, the new code restores proper operation when communications are restored. (gnats:387) --- src/slurmctld/controller.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index cff832d9f41..a0b71fb61c7 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -80,6 +80,7 @@ #define MIN_CHECKIN_TIME 3 /* Nodes have this number of seconds to * check-in before we ping them */ #define MEM_LEAK_TEST 0 /* Running memory leak test if set */ +#define SHUTDOWN_WAIT 2 /* Time to wait for backup server shutdown */ /* Log to stderr and syslog until becomes a daemon */ @@ -94,6 +95,7 @@ static int daemonize = DEFAULT_DAEMONIZE; static int debug_level = 0; static char *debug_logfile = NULL; static bool dump_core = false; +static char node_name[MAX_NAME_LEN]; static int recover = DEFAULT_RECOVER; static pthread_cond_t server_thread_cond = PTHREAD_COND_INITIALIZER; static pid_t slurmctld_pid; @@ -117,7 +119,7 @@ static void _parse_commandline(int argc, char *argv[], inline static int _report_locks_set(void); static void * _service_connection(void *arg); static int _set_slurmctld_state_loc(void); -static int _shutdown_backup_controller(void); +static int _shutdown_backup_controller(int wait_time); static void * _slurmctld_background(void *no_data); static void * _slurmctld_rpc_mgr(void *no_data); static void * _slurmctld_signal_hand(void *no_data); @@ -133,7 +135,6 @@ typedef struct connection_arg { int main(int argc, char *argv[]) { int error_code; - char node_name[MAX_NAME_LEN]; pthread_attr_t thread_attr_sig, thread_attr_rpc; /* @@ -225,7 +226,7 @@ int main(int argc, char *argv[]) } else if (slurmctld_conf.control_machine && (strcmp(node_name, slurmctld_conf.control_machine) == 0)) { - (void) _shutdown_backup_controller(); + (void) _shutdown_backup_controller(SHUTDOWN_WAIT); /* Now recover the remaining state information */ if ((error_code = read_slurm_conf(recover))) { error("read_slurm_conf reading %s: %m", @@ -600,7 +601,9 @@ static void *_slurmctld_background(void *no_data) static time_t last_ping_node_time; static time_t last_ping_srun_time; static time_t last_timelimit_time; + static time_t last_assert_primary_time; time_t now; + /* Locks: Read job */ slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK @@ -620,7 +623,7 @@ static void *_slurmctld_background(void *no_data) /* Let the dust settle before doing work */ now = time(NULL); last_sched_time = last_checkpoint_time = last_group_time = now; - last_timelimit_time = now; + last_timelimit_time = last_assert_primary_time = now; last_ping_node_time = now + (time_t)MIN_CHECKIN_TIME - (time_t)slurmctld_conf.heartbeat_interval; last_ping_srun_time = now; @@ -706,6 +709,21 @@ static void *_slurmctld_background(void *no_data) save_all_state(); } + /* Reassert this machine as the primary controller. + * A network or security problem could result in + * the backup controller assuming control even + * while the real primary controller is running */ + if (slurmctld_conf.slurmctld_timeout && + slurmctld_conf.backup_addr && + slurmctld_conf.backup_addr[0] && + (difftime(now, last_assert_primary_time) >= + slurmctld_conf.slurmd_timeout) && + node_name && slurmctld_conf.backup_controller && + strcmp(node_name, slurmctld_conf.backup_controller)) { + last_assert_primary_time = now; + (void) _shutdown_backup_controller(0); + } + } debug3("_slurmctld_background shutting down"); @@ -899,9 +917,10 @@ static void _usage(char *prog_name) /* * Tell the backup_controller to relinquish control, primary control_machine * has resumed operation + * wait_time - How long to wait for backup controller to write state, seconds * RET 0 or an error code */ -static int _shutdown_backup_controller(void) +static int _shutdown_backup_controller(int wait_time) { int rc; slurm_msg_t req; @@ -921,7 +940,7 @@ static int _shutdown_backup_controller(void) if (slurm_send_recv_rc_msg(&req, &rc, CONTROL_TIMEOUT) < 0) { error("shutdown_backup:send/recv: %m"); - return SLURM_SOCKET_ERROR; + return SLURM_ERROR; } if (rc) { @@ -935,7 +954,8 @@ static int _shutdown_backup_controller(void) * not presently the case (it returns when no other work is pending, * so the state save should occur right away). We sleep for a while * here and give the backup controller time to shutdown */ - sleep(2); + if (wait_time) + sleep(wait_time); return SLURM_SUCCESS; } -- GitLab