From 192eeb9b50f736b965c440cc2b157e280d3fc069 Mon Sep 17 00:00:00 2001 From: Tim Wickberg <tim@schedmd.com> Date: Fri, 8 Sep 2017 17:26:26 -0600 Subject: [PATCH] Change the backup to use the heartbeat file instead of the job_state. Only cut over when the heartbeat file is not being updated any longer. Bug 4142. --- src/slurmctld/backup.c | 17 +++++++++-------- src/slurmctld/heartbeat.c | 37 +++++++++++++++++++++++++++++++++++++ src/slurmctld/heartbeat.h | 8 ++++++++ 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c index 2177bd483a1..88d4208ac5f 100644 --- a/src/slurmctld/backup.c +++ b/src/slurmctld/backup.c @@ -60,6 +60,7 @@ #include "src/common/xsignal.h" #include "src/common/xstring.h" +#include "src/slurmctld/heartbeat.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/read_config.h" #include "src/slurmctld/slurmctld.h" @@ -160,19 +161,19 @@ void run_backup(slurm_trigger_callbacks_t *callbacks) /* primary no longer respond */ break; } else { - time_t use_time, last_write; + time_t use_time, last_heartbeat; - last_write = get_last_state_write_time(); - debug("%s: last_state_write_time %ld", __func__, - last_write); + last_heartbeat = get_last_heartbeat(); + debug("%s: last_heartbeat %ld", __func__, + last_heartbeat); - if (last_write > last_controller_response) { + if (last_heartbeat > last_controller_response) { error("Last message to the controller was at %ld," - " but the last state update was written at %ld," + " but the last heartbeat was written at %ld," " trusting the filesystem instead of the network" " and not asserting control at this time.", - last_controller_response, last_write); - use_time = last_write; + last_controller_response, last_heartbeat); + use_time = last_heartbeat; } else use_time = last_controller_response; diff --git a/src/slurmctld/heartbeat.c b/src/slurmctld/heartbeat.c index e50999029de..fd7bce03f5e 100644 --- a/src/slurmctld/heartbeat.c +++ 
b/src/slurmctld/heartbeat.c
@@ -35,6 +35,7 @@
 \*****************************************************************************/
 
 #include <pthread.h>
+#include <time.h>
 
 #include "src/common/fd.h"
 #include "src/common/xstring.h"
@@ -166,3 +167,54 @@ void heartbeat_stop(void)
 	}
 	slurm_mutex_unlock(&heartbeat_mutex);
 }
+
+#define OPEN_RETRIES 3
+
+/*
+ * Read the most recent timestamp from the "heartbeat" file located in
+ * slurmctld_conf.state_save_location.
+ *
+ * The open() is attempted up to OPEN_RETRIES times; the first descriptor
+ * obtained is the only one used, and it is closed before returning.
+ *
+ * Returns the stored value passed through NTOH_uint64(), or zero if the
+ * file could not be opened or a full 64-bit value could not be read.
+ */
+time_t get_last_heartbeat(void)
+{
+	char *file;
+	int fd = -1, i;
+	uint64_t value = 0;
+
+	file = xstrdup_printf("%s/heartbeat",
+			      slurmctld_conf.state_save_location);
+
+	/*
+	 * Retry the open() in case the primary is rearranging things
+	 * at the moment. Once opened, our handle should persist during
+	 * the shuffle, as the contents are left intact.
+	 * Stop as soon as one attempt succeeds so that no extra
+	 * descriptors are opened (and leaked).
+	 */
+	for (i = 0; (i < OPEN_RETRIES) && (fd < 0); i++) {
+		fd = open(file, O_RDONLY);
+		if (fd < 0)
+			error("%s: heartbeat open attempt %d failed from %s.",
+			      __func__, i, file);
+	}
+
+	if (fd < 0) {
+		/* all attempts failed */
+		xfree(file);
+		return (time_t) 0;
+	}
+
+	if (read(fd, &value, sizeof(value)) != sizeof(value)) {
+		error("%s: heartbeat read failed from %s.",
+		      __func__, file);
+		/* a short read may have left partial bytes behind */
+		value = 0;
+	}
+
+	close(fd);
+	xfree(file);
+
+	return (time_t) NTOH_uint64(value);
+}
diff --git a/src/slurmctld/heartbeat.h b/src/slurmctld/heartbeat.h
index efe29fa0cbd..aceb410cbc6 100644
--- a/src/slurmctld/heartbeat.h
+++ b/src/slurmctld/heartbeat.h
@@ -37,10 +37,18 @@
 #ifndef _SLURM_HEARTBEAT_H
 #define _SLURM_HEARTBEAT_H
 
+#include <time.h> /* for time_t */
+
 /* launch heartbeat thread */
 extern void heartbeat_start(void);
 
 /* stop heartbeat thread */
 extern void heartbeat_stop(void);
 
+/*
+ * Returns the last timestamp from the heartbeat file,
+ * or zero on error.
+ */
+extern time_t get_last_heartbeat(void);
+
 #endif
-- 
GitLab