diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c
index 2177bd483a16a805d2f039193e6811a811a5af20..88d4208ac5fa1437e7baf28958b7b7fb65ea7dc6 100644
--- a/src/slurmctld/backup.c
+++ b/src/slurmctld/backup.c
@@ -60,6 +60,7 @@
 #include "src/common/xsignal.h"
 #include "src/common/xstring.h"
 
+#include "src/slurmctld/heartbeat.h"
 #include "src/slurmctld/locks.h"
 #include "src/slurmctld/read_config.h"
 #include "src/slurmctld/slurmctld.h"
@@ -160,19 +161,19 @@ void run_backup(slurm_trigger_callbacks_t *callbacks)
 			/* primary no longer respond */
 			break;
 		} else {
-			time_t use_time, last_write;
+			time_t use_time, last_heartbeat;
 
-			last_write = get_last_state_write_time();
-			debug("%s: last_state_write_time %ld", __func__,
-			      last_write);
+			last_heartbeat = get_last_heartbeat();
+			debug("%s: last_heartbeat %ld", __func__,
+			      last_heartbeat);
 
-			if (last_write > last_controller_response) {
+			if (last_heartbeat > last_controller_response) {
 				error("Last message to the controller was at %ld,"
-				      " but the last state update was written at %ld,"
+				      " but the last heartbeat was written at %ld,"
 				      " trusting the filesystem instead of the network"
 				      " and not asserting control at this time.",
-				      last_controller_response, last_write);
-				use_time = last_write;
+				      last_controller_response, last_heartbeat);
+				use_time = last_heartbeat;
 			} else
 				use_time = last_controller_response;
 
diff --git a/src/slurmctld/heartbeat.c b/src/slurmctld/heartbeat.c
index e50999029decc9f71a16a385a779297c6d676d60..fd7bce03f5ec0ea8ce7a8af27b3260842af90c58 100644
--- a/src/slurmctld/heartbeat.c
+++ b/src/slurmctld/heartbeat.c
@@ -35,6 +35,7 @@
 \*****************************************************************************/
 
 #include <pthread.h>
+#include <time.h>
 
 #include "src/common/fd.h"
 #include "src/common/xstring.h"
@@ -166,3 +167,52 @@ void heartbeat_stop(void)
 	}
 	slurm_mutex_unlock(&heartbeat_mutex);
 }
+
+#define OPEN_RETRIES 3
+
+/*
+ * get_last_heartbeat - read the timestamp stored in the heartbeat file
+ *
+ * Reads a uint64_t from "<state_save_location>/heartbeat" and converts it
+ * with NTOH_uint64().
+ *
+ * RET the stored timestamp, or zero if the file cannot be opened or read
+ */
+time_t get_last_heartbeat(void)
+{
+	char *file;
+	int fd = -1, i;
+	uint64_t value;
+
+	file = xstrdup_printf("%s/heartbeat",
+			      slurmctld_conf.state_save_location);
+
+	/*
+	 * Retry the open() in case the primary is rearranging things
+	 * at the moment. Once opened, our handle should persist during
+	 * the shuffle, as the contents are left intact.
+	 */
+	for (i = 0; (i < OPEN_RETRIES) && (fd < 0); i++) {
+		fd = open(file, O_RDONLY);
+		if (fd < 0)
+			error("%s: heartbeat open attempt %d failed from %s.",
+			      __func__, i, file);
+	}
+
+	/* Every attempt failed: release the path and report "no heartbeat". */
+	if (fd < 0) {
+		xfree(file);
+		return (time_t) 0;
+	}
+
+	if (read(fd, &value, sizeof(uint64_t)) != sizeof(uint64_t)) {
+		error("%s: heartbeat read failed from %s.",
+		      __func__, file);
+		value = 0;
+	}
+
+	close(fd);
+	xfree(file);
+
+	return (time_t) NTOH_uint64(value);
+}
diff --git a/src/slurmctld/heartbeat.h b/src/slurmctld/heartbeat.h
index efe29fa0cbdc4dc97c567fa117ace465181b28cb..aceb410cbc66fc7e9daaa6af159e6de817530482 100644
--- a/src/slurmctld/heartbeat.h
+++ b/src/slurmctld/heartbeat.h
@@ -37,10 +37,18 @@
 #ifndef _SLURM_HEARTBEAT_H
 #define _SLURM_HEARTBEAT_H
 
+#include <time.h> /* for time_t */
+
 /* launch heartbeat thread */
 extern void heartbeat_start(void);
 
 /* stop heartbeat thread */
 extern void heartbeat_stop(void);
 
+/*
+ * Returns the last timestamp from the heartbeat file,
+ * or zero on error.
+ */
+extern time_t get_last_heartbeat(void);
+
 #endif