power_save.c 13.27 KiB
/*****************************************************************************\
* power_save.c - support node power saving mode. Nodes which have been
* idle for an extended period of time will be placed into a power saving
* mode by running an arbitrary script (typically to set frequency governor).
* When the node is restored to normal operation, another script will be
* executed. Many parameters are available to control this mode of operation.
*****************************************************************************
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "src/common/bitstring.h"
#include "src/common/xstring.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/slurmctld.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <unistd.h>
/* When non-zero, power state changes are logged with info() rather than
 * debug() (see the "#if _DEBUG" sections below). */
#define _DEBUG 0
/* NOTE: These parameters will be moved into the slurm.conf file in version 1.3.
 * Directly modify the default values here in order to enable this capability
 * in SLURM version 1.2. */
/* A node becomes eligible for power saving mode after being idle for
 * this number of seconds. A negative number disables power saving mode. */
#define DEFAULT_IDLE_TIME -1
/* Maximum number of nodes to be placed into or removed from power saving mode
 * per minute. Use this to prevent rapid changes in power requirements.
 * A value of zero results in no limits being imposed. */
#define DEFAULT_SUSPEND_RATE 60
#define DEFAULT_RESUME_RATE 60
/* Programs to be executed to place nodes into or take them out of power
 * saving mode. These are run as user SlurmUser. The names of the nodes to be
 * modified (a SLURM hostlist expression) will be passed as the single
 * argument to the program. */
#define DEFAULT_SUSPEND_PROGRAM "/home/jette/slurm.mdev/sbin/slurm.node.suspend"
#define DEFAULT_RESUME_PROGRAM "/home/jette/slurm.mdev/sbin/slurm.node.resume"
/* Individual nodes or all nodes in selected partitions can be excluded from
 * being placed into power saving mode. SLURM hostlist expressions can be used.
 * Multiple partitions may be listed with a comma separator. */
#define DEFAULT_EXCLUDE_SUSPEND_NODES NULL
#define DEFAULT_EXCLUDE_SUSPEND_PARTITIONS NULL
/* Working copies of the defaults above, loaded by _init_power_config() */
int idle_time, suspend_rate, resume_rate;
char *suspend_prog = NULL, *resume_prog = NULL;
char *exc_nodes = NULL, *exc_parts = NULL;
/* Nodes that must never be suspended; built by _init_power_config() from
 * exc_nodes and from the node bitmaps of the partitions in exc_parts */
bitstr_t *exc_node_bitmap = NULL;
/* Running counters compared against suspend_rate and resume_rate by
 * _do_power_work() to throttle state changes; they are reset or scaled
 * down at the start of each scan based on the time since the last scan */
int suspend_cnt, resume_cnt;
/* Local functions */
static void _do_power_work(void);
static void _do_resume(char *host);
static void _do_suspend(char *host);
static int _init_power_config(void);
static void _kill_zombies(void);
static void _re_wake(void);
static pid_t _run_prog(char *prog, char *arg);
static bool _valid_prog(char *file_name);
/*
 * _do_power_work - Perform any power change work to nodes.
 *
 * Scans the whole node table once and
 *  - wakes nodes that are in power saving mode but are allocated or were
 *    idle more recently than idle_time seconds ago, and
 *  - suspends nodes that are idle, not excluded and have been idle for
 *    more than idle_time seconds.
 * The number of nodes changed in each direction is throttled by
 * resume_rate (wake-ups) and suspend_rate (suspends); a rate of zero
 * means no limit. The node state flag is updated here and the resume or
 * suspend program is then run once with the full list of changed nodes.
 * If nothing changed, _re_wake() gets a chance to re-issue resume
 * requests.
 *
 * The caller (init_power_save) holds the node write lock.
 */
static void _do_power_work(void)
{
	static time_t last_log = 0, last_work_scan = 0;
	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
	time_t now = time(NULL), delta_t;
	uint16_t base_state, susp_state;
	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
	struct node_record *node_ptr;

	/* Set limit on counts of nodes to have state changed.
	 * After a full minute the counters start over; otherwise they are
	 * scaled down in proportion to the time elapsed. A negative delta
	 * (clock stepped backwards) leaves the counters untouched rather
	 * than inflating them. */
	delta_t = now - last_work_scan;
	if (delta_t >= 60) {
		suspend_cnt = 0;
		resume_cnt  = 0;
	} else if (delta_t > 0) {
		float rate = (60 - delta_t) / 60.0;
		suspend_cnt *= rate;
		resume_cnt  *= rate;
	}
	last_work_scan = now;

	/* Build bitmaps identifying each node which should change state */
	for (i = 0; i < node_record_count; i++) {
		node_ptr = &node_record_table_ptr[i];
		base_state = node_ptr->node_state & NODE_STATE_BASE;
		susp_state = node_ptr->node_state & NODE_STATE_POWER_SAVE;

		if (susp_state)
			susp_total++;

		/* Wake-up: throttled by the RESUME rate and counter */
		if (susp_state
		&&  ((resume_rate == 0) || (resume_cnt < resume_rate))
		&&  ((base_state == NODE_STATE_ALLOCATED)
		||   (node_ptr->last_idle > (now - idle_time)))) {
			if (wake_node_bitmap == NULL)
				wake_node_bitmap = bit_alloc(node_record_count);
			wake_cnt++;
			resume_cnt++;
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			bit_set(wake_node_bitmap, i);
		}

		/* Suspend: throttled by the SUSPEND rate and counter.
		 * susp_state was sampled before the wake-up test above, so
		 * a node that was just woken is not suspended again here. */
		if ((susp_state == 0)
		&&  ((suspend_rate == 0) || (suspend_cnt < suspend_rate))
		&&  (base_state == NODE_STATE_IDLE)
		&&  (node_ptr->last_idle < (now - idle_time))
		&&  ((exc_node_bitmap == NULL) ||
		     (bit_test(exc_node_bitmap, i) == 0))) {
			if (sleep_node_bitmap == NULL)
				sleep_node_bitmap = bit_alloc(node_record_count);
			sleep_cnt++;
			suspend_cnt++;
			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
			bit_set(sleep_node_bitmap, i);
		}
	}

	/* Report the number of nodes in power save mode every 10 minutes */
	if ((now - last_log) > 600) {
		info("Power save mode %d nodes", susp_total);
		last_log = now;
	}

	if ((wake_cnt == 0) && (sleep_cnt == 0))
		_re_wake();	/* No work to be done now */

	if (sleep_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(sleep_node_bitmap);
		if (nodes)
			_do_suspend(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		bit_free(sleep_node_bitmap);
		/* Node state flags changed above; record the update time */
		last_node_update = now;
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes)
			_do_resume(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		bit_free(wake_node_bitmap);
		/* Node state flags changed above; record the update time */
		last_node_update = now;
	}
}
/*
 * _re_wake - Just in case some resume calls failed, re-issue the requests
 * periodically for active nodes. We do not increment resume_cnt
 * since there should be no change in power requirements.
 *
 * Runs at most once per minute and examines at most 20 node records per
 * run, continuing from where the previous run stopped (static cursor)
 * and wrapping around at the end of the node table. Every examined node
 * that is NOT flagged as being in power saving mode is passed to the
 * resume program.
 */
static void _re_wake(void)
{
	static time_t last_wakeup = 0;
	static int last_inx = 0;
	time_t now = time(NULL);
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	int i, lim = MIN(node_record_count, 20);

	/* Run at most once per minute */
	if ((now - last_wakeup) < 60)
		return;
	last_wakeup = now;

	for (i = 0; i < lim; i++) {
		/* Wrap the cursor BEFORE using it: it is static, so it may
		 * be stale (past the end of the table) if the node count
		 * has shrunk since the previous call. lim > 0 guarantees
		 * that index 0 is valid. */
		if (last_inx >= node_record_count)
			last_inx = 0;
		node_ptr = &node_record_table_ptr[last_inx];
		if ((node_ptr->node_state & NODE_STATE_POWER_SAVE) == 0) {
			if (wake_node_bitmap == NULL)
				wake_node_bitmap = bit_alloc(node_record_count);
			bit_set(wake_node_bitmap, last_inx);
		}
		last_inx++;
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes) {
#if _DEBUG
			info("power_save: rewaking nodes %s", nodes);
#else
			debug("power_save: rewaking nodes %s", nodes);
#endif
			_run_prog(resume_prog, nodes);
		} else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		bit_free(wake_node_bitmap);
	}
}
/*
 * _do_resume - log the wake-up request and run the resume program.
 * host IN - node name expression handed to the resume program as its
 *	only argument
 */
static void _do_resume(char *host)
{
#if !_DEBUG
	debug("power_save: waking nodes %s", host);
#else
	info("power_save: waking nodes %s", host);
#endif

	/* The child's pid is not tracked; _kill_zombies() reaps it later */
	(void) _run_prog(resume_prog, host);
}
/*
 * _do_suspend - log the suspend request and run the suspend program.
 * host IN - node name expression handed to the suspend program as its
 *	only argument
 */
static void _do_suspend(char *host)
{
#if !_DEBUG
	debug("power_save: suspending nodes %s", host);
#else
	info("power_save: suspending nodes %s", host);
#endif

	/* The child's pid is not tracked; _kill_zombies() reaps it later */
	(void) _run_prog(suspend_prog, host);
}
/*
 * _run_prog - fork a child process and execute a program in it.
 * prog IN - fully qualified pathname of the program; NULL means the
 *	action is disabled (useful for testing)
 * arg IN - the single argument passed to the program
 * RET pid of the child process, or -1 if nothing was started (prog or
 *	arg is NULL, or fork() failed)
 *
 * The parent does not wait for the child; see _kill_zombies().
 * The strings are handed to execl() directly instead of being copied
 * into fixed size buffers, so they can neither be truncated nor left
 * without a terminating NUL. The child works on its own copy of the
 * address space, so the caller may free arg as soon as this returns.
 */
static pid_t _run_prog(char *prog, char *arg)
{
	char *pname;
	pid_t child;

	if (prog == NULL)	/* disabled, useful for testing */
		return -1;
	if (arg == NULL)	/* nothing to act upon */
		return -1;

	/* argv[0] is the program name without its directory part */
	pname = strrchr(prog, '/');
	if (pname == NULL)
		pname = prog;
	else
		pname++;

	child = fork();
	if (child == 0) {
		int i;
		/* Do not leak the daemon's open files into the program */
		for (i = 0; i < 128; i++)
			close(i);
		/* The list terminator must be a null POINTER; a bare NULL
		 * is not guaranteed to be one in a variadic call */
		execl(prog, pname, arg, (char *) NULL);
		/* exec failed: leave without running the parent's atexit
		 * handlers or flushing its stdio buffers a second time */
		_exit(1);
	} else if (child < 0)
		error("fork: %m");

	return child;
}
/*
 * _kill_zombies - reap every child process that has already terminated.
 * Individual process IDs are not tracked (the value returned by
 * _run_prog() could be recorded if per-process tracking is ever wanted),
 * so simply collect whatever is ready. Never blocks.
 */
static void _kill_zombies(void)
{
	pid_t reaped;

	do {
		/* > 0: one child collected, try for another
		 *   0: children exist but none has exited yet
		 * < 0: no children at all (or other error) */
		reaped = waitpid(-1, NULL, WNOHANG);
	} while (reaped > 0);
}
/*
 * _init_power_config - Initialize power_save module parameters from the
 *	compiled-in DEFAULT_* values and validate them.
 * RET 0 on valid configuration to run power saving,
 *	otherwise log the problem and return -1
 *
 * On success exc_node_bitmap identifies every excluded node (those named
 * in exc_nodes plus all nodes of the partitions named in exc_parts), or
 * is NULL if nothing is excluded. Strings and bitmaps allocated here are
 * released by init_power_save(), including after a -1 return.
 */
static int _init_power_config(void)
{
	idle_time    = DEFAULT_IDLE_TIME;
	suspend_rate = DEFAULT_SUSPEND_RATE;
	resume_rate  = DEFAULT_RESUME_RATE;
	if (DEFAULT_SUSPEND_PROGRAM)
		suspend_prog = xstrdup(DEFAULT_SUSPEND_PROGRAM);
	if (DEFAULT_RESUME_PROGRAM)
		resume_prog = xstrdup(DEFAULT_RESUME_PROGRAM);
	if (DEFAULT_EXCLUDE_SUSPEND_NODES)
		exc_nodes = xstrdup(DEFAULT_EXCLUDE_SUSPEND_NODES);
	if (DEFAULT_EXCLUDE_SUSPEND_PARTITIONS)
		exc_parts = xstrdup(DEFAULT_EXCLUDE_SUSPEND_PARTITIONS);

	if (idle_time < 0) {	/* not an error */
		debug("power_save module disabled, idle_time < 0");
		return -1;
	}
	/* A rate of zero is valid and means "no limit"; only negative
	 * values are configuration errors */
	if (suspend_rate < 0) {
		error("power_save module disabled, suspend_rate < 0");
		return -1;
	}
	if (resume_rate < 0) {
		error("power_save module disabled, resume_rate < 0");
		return -1;
	}

	/* A NULL program is tolerated (the action becomes a no-op in
	 * _run_prog), but a program that is set must pass validation */
	if (suspend_prog == NULL)
		info("WARNING: power_save module has NULL suspend program");
	else if (!_valid_prog(suspend_prog)) {
		error("power_save module disabled, invalid suspend program %s",
			suspend_prog);
		return -1;
	}
	if (resume_prog == NULL)
		info("WARNING: power_save module has NULL resume program");
	else if (!_valid_prog(resume_prog)) {
		error("power_save module disabled, invalid resume program %s",
			resume_prog);
		return -1;
	}

	if (exc_nodes
	&&  (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
		error("power_save module disabled, "
			"invalid excluded nodes %s", exc_nodes);
		return -1;
	}

	if (exc_parts) {
		char *tmp, *one_part, *part_list;
		struct part_record *part_ptr;
		int rc = 0;

		/* strtok_r() modifies its input, so work on a copy */
		part_list = xstrdup(exc_parts);
		one_part = strtok_r(part_list, ",", &tmp);
		while (one_part != NULL) {
			part_ptr = find_part_record(one_part);
			if (!part_ptr) {
				/* Report the NAME that was not found;
				 * part_ptr is NULL at this point */
				error("power_save module disabled, "
					"invalid excluded partition %s",
					one_part);
				rc = -1;
				break;
			}
			/* Merge this partition's nodes into the exclusions */
			if (exc_node_bitmap)
				bit_or(exc_node_bitmap, part_ptr->node_bitmap);
			else
				exc_node_bitmap = bit_copy(part_ptr->node_bitmap);
			one_part = strtok_r(NULL, ",", &tmp);
		}
		xfree(part_list);
		if (rc)
			return rc;
	}

	if (exc_node_bitmap) {
		char *tmp = bitmap2node_name(exc_node_bitmap);
		debug("power_save module, excluded nodes %s", tmp);
		xfree(tmp);
	}

	return 0;
}
/*
 * _valid_prog - test whether a pathname is acceptable as a suspend or
 *	resume program.
 * file_name IN - pathname to test
 * RET true if file_name is an absolute pathname of an existing regular
 *	file that is writable by neither group nor others; otherwise the
 *	reason is logged with debug() and false is returned
 */
static bool _valid_prog(char *file_name)
{
	struct stat info_buf;
	bool usable = false;

	if (file_name[0] != '/') {
		debug("program %s not absolute pathname", file_name);
	} else if (stat(file_name, &info_buf) != 0) {
		debug("program %s not found", file_name);
	} else if (!S_ISREG(info_buf.st_mode)) {
		debug("program %s not regular file", file_name);
	} else if ((info_buf.st_mode & 022) != 0) {
		/* 022 == write permission bits for group and for others */
		debug("program %s has group or world write permission",
			file_name);
	} else {
		usable = true;
	}

	return usable;
}
/*
 * init_power_save - body of the power save thread.
 *
 * Loads the module configuration and, if it is valid, wakes up once per
 * second until slurmctld shutdown to reap finished suspend/resume
 * programs and to scan the nodes for needed power state changes.
 * All memory allocated by the module is released before returning.
 *
 * arg IN - unused
 * RET always NULL
 */
extern void *init_power_save(void *arg)
{
	/* Locks: Read config, node, and partitions */
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, READ_LOCK, READ_LOCK };
	/* Locks: Write node, read jobs and partitions */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, READ_LOCK, WRITE_LOCK, READ_LOCK };
	time_t last_power_scan = 0;
	time_t now;
	int rc;

	lock_slurmctld(config_read_lock);
	rc = _init_power_config();
	unlock_slurmctld(config_read_lock);

	/* An unusable or disabled configuration skips straight to cleanup */
	if (rc == 0) {
		while (slurmctld_config.shutdown_time == 0) {
			sleep(1);
			_kill_zombies();

			/* Scan when some node was updated since the last
			 * scan or when 60 seconds have passed since it,
			 * whichever happens first */
			now = time(NULL);
			if ((last_node_update >= last_power_scan)
			||  (now >= (last_power_scan + 60))) {
				lock_slurmctld(node_write_lock);
				_do_power_work();
				unlock_slurmctld(node_write_lock);
				last_power_scan = now;
			}
		}
	}

	/* Free all allocated memory */
	xfree(suspend_prog);
	xfree(resume_prog);
	xfree(exc_nodes);
	xfree(exc_parts);
	FREE_NULL_BITMAP(exc_node_bitmap);
	return NULL;
}