From e735778d16e8624550a68fc8297d0e78ffa3192d Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Fri, 10 Apr 2009 20:43:14 +0000
Subject: [PATCH] added backups

---
 src/slurmdbd/backup.c | 201 ++++++++++++++++++++++++++++++++++++++++++
 src/slurmdbd/backup.h |  58 +++++++++++++
 2 files changed, 259 insertions(+)
 create mode 100644 src/slurmdbd/backup.c
 create mode 100644 src/slurmdbd/backup.h

diff --git a/src/slurmdbd/backup.c b/src/slurmdbd/backup.c
new file mode 100644
index 00000000000..096b15d138c
--- /dev/null
+++ b/src/slurmdbd/backup.c
@@ -0,0 +1,201 @@
+/*****************************************************************************\
+ *  backup.c - backup slurm dbd
+ *****************************************************************************
+ *  Copyright (C) 2009 Lawrence Livermore National Security.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Danny Auble <da@llnl.gov>
+ *  CODE-OCEC-09-009. All rights reserved.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <https://computing.llnl.gov/linux/slurm/>.
+ *  Please also read the included file: DISCLAIMER.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  In addition, as a special exception, the copyright holders give permission
+ *  to link the code of portions of this program with the OpenSSL library under
+ *  certain conditions as described in each individual source file, and
+ *  distribute linked combinations including the two. You must obey the GNU
+ *  General Public License in all respects for all of the code used other than
+ *  OpenSSL. If you modify file(s) with this exception, you may extend this
+ *  exception to your version of the file(s), but you are not obligated to do
+ *  so. If you do not wish to do so, delete this exception statement from your
+ *  version.  If you delete this exception statement from all source files in
+ *  the program, then also delete it here.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
+\*****************************************************************************/
+
+#include <errno.h>
+#include <sys/poll.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "src/common/xmalloc.h"
+#include "src/common/slurm_protocol_defs.h"
+#include "src/common/fd.h"
+#include "src/common/log.h"
+#include "src/common/slurmdbd_defs.h"
+
+#include "src/slurmdbd/backup.h"
+
+/* Connection to the primary SlurmDBD, -1 while no connection is open */
+static slurm_fd slurmdbd_fd = -1;
+
+/* Return time in msec since "start time" (wall clock, gettimeofday()) */
+static int _tot_wait (struct timeval *start_time)
+{
+	struct timeval end_time;
+	int msec_delay;
+
+	gettimeofday(&end_time, NULL);
+	msec_delay =  (end_time.tv_sec  - start_time->tv_sec ) * 1000;
+	msec_delay += ((end_time.tv_usec - start_time->tv_usec + 500) / 1000);
+	return msec_delay;
+}
+
+/* Wait until a file descriptor is readable.
+ * IN fd - descriptor to poll
+ * IN read_timeout - total time to wait, in msec
+ * RET true if poll() reports POLLIN; false on timeout, hang-up, invalid
+ *	descriptor, error or once shutdown_time is set */
+static bool _fd_readable(slurm_fd fd, int read_timeout)
+{
+	struct pollfd ufds;
+	int rc, time_left;
+	struct timeval tstart;
+
+	ufds.fd     = fd;
+	ufds.events = POLLIN;
+	gettimeofday(&tstart, NULL);
+	while (shutdown_time == 0) {
+		time_left = read_timeout - _tot_wait(&tstart);
+		/* A negative timeout makes poll() block indefinitely.
+		 * That can happen once retries after EINTR/EAGAIN have
+		 * used up the budget, so clamp to a non-blocking check. */
+		if (time_left < 0)
+			time_left = 0;
+		rc = poll(&ufds, 1, time_left);
+		if (rc == -1) {
+			if ((errno == EINTR) || (errno == EAGAIN))
+				continue;
+			error("poll: %m");
+			return false;
+		}
+		if (rc == 0)
+			return false;
+		if (ufds.revents & POLLHUP) {
+			debug2("Primary SlurmDBD connection closed");
+			return false;
+		}
+		if (ufds.revents & POLLNVAL) {
+			error("Primary SlurmDBD connection is invalid");
+			return false;
+		}
+		if (ufds.revents & POLLERR) {
+			error("Primary SlurmDBD connection "
+			      "experienced an error");
+			return false;
+		}
+		if ((ufds.revents & POLLIN) == 0) {
+			error("SlurmDBD connection %d events %d",
+			      fd, ufds.revents);
+			return false;
+		}
+		/* revents == POLLIN */
+		return true;
+	}
+	return false;
+}
+
+/* Open a connection to the Slurm DBD and set slurmdbd_fd.
+ * Nothing is opened when the port in dbd_addr is zero. */
+static void _open_slurmdbd_fd(slurm_addr dbd_addr)
+{
+	if(dbd_addr.sin_port == 0) {
+		error("sin_port == 0 in the slurmdbd backup");
+		return;
+	}
+
+	slurmdbd_fd = slurm_open_msg_conn(&dbd_addr);
+
+	if (slurmdbd_fd >= 0)
+		fd_set_nonblocking(slurmdbd_fd);
+}
+
+/* Close the SlurmDbd connection, if open, and reset slurmdbd_fd to -1 */
+static void _close_slurmdbd_fd(void)
+{
+	if (slurmdbd_fd >= 0) {
+		close(slurmdbd_fd);
+		slurmdbd_fd = -1;
+	}
+}
+
+/* Reopen the Slurm DBD connection due to some error */
+static void _reopen_slurmdbd_fd(slurm_addr dbd_addr)
+{
+	_close_slurmdbd_fd();
+	_open_slurmdbd_fd(dbd_addr);
+}
+
+/* run_backup - this is the backup dbd, it should run in standby
+ *	mode, assuming control when the primary dbd stops responding.
+ *
+ * Monitors the connection to the primary until shutdown_time is set.
+ * Returns with have_control == true as soon as the primary is not
+ * readable while this daemon is not in control.  If this daemon is in
+ * control and the primary becomes readable again, have_control is
+ * cleared and monitoring continues. */
+extern void run_backup(void)
+{
+	slurm_addr dbd_addr;
+
+	sleep(5);		/* Give the primary slurmdbd set-up time */
+
+	/* get a connection */
+	slurm_set_addr(&dbd_addr, slurmdbd_conf->dbd_port,
+		       slurmdbd_conf->dbd_host);
+
+	if (dbd_addr.sin_port == 0)
+		error("Unable to locate SlurmDBD host %s:%u",
+		      slurmdbd_conf->dbd_host, slurmdbd_conf->dbd_port);
+	else
+		_open_slurmdbd_fd(dbd_addr);
+
+
+	/* repeatedly ping Primary */
+	while (!shutdown_time) {
+		bool readable = _fd_readable(
+			slurmdbd_fd, slurm_get_msg_timeout() * 1000);
+
+		if (have_control && readable) {
+			/* primary is readable again: give control back */
+			rpc_mgr_wake();
+			rollup_handler_cancel();
+			have_control = false;
+		} else if(!have_control && !readable) {
+			/* primary not responding: take control and return */
+			have_control = true;
+			break;
+		}
+
+		sleep(1);
+		if(!readable)
+			_reopen_slurmdbd_fd(dbd_addr);
+	}
+
+	_close_slurmdbd_fd();
+
+	return;
+}
diff --git a/src/slurmdbd/backup.h b/src/slurmdbd/backup.h
new file mode 100644
index 00000000000..b60ce21d061
--- /dev/null
+++ b/src/slurmdbd/backup.h
@@ -0,0 +1,58 @@
+/*****************************************************************************\
+ *  backup.h - backup slurm dbd
+ *****************************************************************************
+ *  Copyright (C) 2009 Lawrence Livermore National Security.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Danny Auble <da@llnl.gov>
+ *  CODE-OCEC-09-009. All rights reserved.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <https://computing.llnl.gov/linux/slurm/>.
+ *  Please also read the included file: DISCLAIMER.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  In addition, as a special exception, the copyright holders give permission
+ *  to link the code of portions of this program with the OpenSSL library under
+ *  certain conditions as described in each individual source file, and
+ *  distribute linked combinations including the two. You must obey the GNU
+ *  General Public License in all respects for all of the code used other than
+ *  OpenSSL. If you modify file(s) with this exception, you may extend this
+ *  exception to your version of the file(s), but you are not obligated to do
+ *  so. If you do not wish to do so, delete this exception statement from your
+ *  version.  If you delete this exception statement from all source files in
+ *  the program, then also delete it here.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
+\*****************************************************************************/
+
+#ifndef _DBDBACKUP_H
+#define _DBDBACKUP_H
+
+#include "src/slurmdbd/read_config.h"
+#include "src/slurmdbd/rpc_mgr.h"
+#include "src/slurmdbd/slurmdbd.h"
+
+/* NOTE(review): "backup" is defined outside this header; presumably true
+ * when this daemon runs as the backup dbd -- confirm at its definition */
+extern bool backup;
+/* Written by run_backup(): set when it takes over for the primary,
+ * cleared when the primary becomes readable again */
+extern bool have_control;
+
+/* run_backup - this is the backup dbd, it should run in standby
+ *	mode, assuming control when the primary dbd stops responding */
+extern void run_backup(void);
+
+
+#endif
-- 
GitLab