Skip to content
Snippets Groups Projects
power_save.c 13.27 KiB
/*****************************************************************************\
 *  power_save.c - support node power saving mode. Nodes which have been 
 *  idle for an extended period of time will be placed into a power saving 
 *  mode by running an arbitrary script (typically to set frequency governor).
 *  When the node is restored to normal operation, another script will be 
 *  executed. Many parameters are available to control this mode of operation.
 *****************************************************************************
 *  Copyright (C) 2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  UCRL-CODE-226842.
 *  
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 *  
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission 
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and 
 *  distribute linked combinations including the two. You must obey the GNU 
 *  General Public License in all respects for all of the code used other than 
 *  OpenSSL. If you modify file(s) with this exception, you may extend this 
 *  exception to your version of the file(s), but you are not obligated to do 
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in 
 *  the program, then also delete it here.
 *  
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *  
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "src/common/bitstring.h"
#include "src/common/xstring.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/slurmctld.h"

#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <unistd.h>

#define _DEBUG 0

/* NOTE: These parameters will be moved into the slurm.conf file in version
 * 1.3. Directly modify the default values here in order to enable this
 * capability in SLURM version 1.2. */

/* Node becomes eligible for power saving mode after being idle for
 * this number of seconds. A negative number disables power saving mode. */
#define DEFAULT_IDLE_TIME	-1

/* Maximum number of nodes to be placed into or removed from power saving mode
 * per minute. Use this to prevent rapid changes in power requirements.
 * A value of zero results in no limits being imposed. */
#define DEFAULT_SUSPEND_RATE	60
#define DEFAULT_RESUME_RATE	60

/* Programs to be executed to place nodes into or take them out of power
 * saving mode. These are run as user SlurmUser. The names of the nodes to be
 * modified will be passed as a single argument to the program. */
#define DEFAULT_SUSPEND_PROGRAM	"/home/jette/slurm.mdev/sbin/slurm.node.suspend"
#define DEFAULT_RESUME_PROGRAM	"/home/jette/slurm.mdev/sbin/slurm.node.resume"

/* Individual nodes or all nodes in selected partitions can be excluded from
 * being placed into power saving mode. SLURM hostlist expressions can be used.
 * Multiple partitions may be listed with a comma separator. */
#define DEFAULT_EXCLUDE_SUSPEND_NODES		NULL
#define DEFAULT_EXCLUDE_SUSPEND_PARTITIONS	NULL

/* Run-time copies of the configuration defaults above; all of these are
 * assigned by _init_power_config(). */
int idle_time, suspend_rate, resume_rate;
char *suspend_prog = NULL, *resume_prog = NULL;
char *exc_nodes = NULL, *exc_parts = NULL;


/* Nodes excluded from power saving, built by _init_power_config() from
 * exc_nodes and the node bitmaps of the partitions named in exc_parts.
 * NULL if nothing is excluded. */
bitstr_t *exc_node_bitmap = NULL;
/* Rate-limiting counters: aged on every pass of _do_power_work() and
 * incremented there for each node whose power state is changed. */
int suspend_cnt, resume_cnt;

/* Local functions */
static void  _do_power_work(void);
static void  _do_resume(char *host);
static void  _do_suspend(char *host);
static int   _init_power_config(void);
static void  _kill_zombies(void);
static void  _re_wake(void);
static pid_t _run_prog(char *prog, char *arg);
static bool  _valid_prog(char *file_name);

/*
 * _do_power_work - Perform any power change work to nodes.
 *
 * Scans every node record and:
 *  - clears NODE_STATE_POWER_SAVE on (and runs the resume program for) nodes
 *    in power saving mode that are allocated or were idle more recently than
 *    idle_time seconds ago, limited by resume_rate;
 *  - sets NODE_STATE_POWER_SAVE on (and runs the suspend program for) idle
 *    nodes whose last_idle time is more than idle_time seconds old and which
 *    are not in exc_node_bitmap, limited by suspend_rate.
 *
 * suspend_cnt and resume_cnt count recent state changes. They are reset when
 * 60 or more seconds have passed since the previous scan and are otherwise
 * scaled down in proportion to the elapsed time. A rate of zero means no
 * limit. If no node needs to change state, _re_wake() is called instead.
 *
 * The caller (init_power_save) invokes this with its node write lock held.
 */
static void _do_power_work(void)
{
	static time_t last_log = 0, last_work_scan = 0;
	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
	time_t now = time(NULL), delta_t;
	uint16_t base_state, susp_state;
	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
	struct node_record *node_ptr;

	/* Set limit on counts of nodes to have state changed */
	delta_t = now - last_work_scan;
	if (delta_t >= 60) {
		suspend_cnt = 0;
		resume_cnt  = 0;
	} else {
		/* Age the counts by the fraction of a minute elapsed */
		float rate = (60 - delta_t) / 60.0;
		suspend_cnt *= rate;
		resume_cnt  *= rate;
	}
	last_work_scan = now;

	/* Build bitmaps identifying each node which should change state */
	for (i=0; i<node_record_count; i++) {
		node_ptr = &node_record_table_ptr[i];
		base_state = node_ptr->node_state & NODE_STATE_BASE;
		susp_state = node_ptr->node_state & NODE_STATE_POWER_SAVE;

		if (susp_state)
			susp_total++;

		/* Waking a node is a resume operation, so it is throttled by
		 * resume_rate/resume_cnt. Use "<" so that no more than the
		 * configured maximum number of nodes is changed. */
		if (susp_state
		&&  ((resume_rate == 0) || (resume_cnt < resume_rate))
		&&  ((base_state == NODE_STATE_ALLOCATED)
		||   (node_ptr->last_idle > (now - idle_time)))) {
			if (wake_node_bitmap == NULL)
				wake_node_bitmap = bit_alloc(node_record_count);
			wake_cnt++;
			resume_cnt++;
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			bit_set(wake_node_bitmap, i);
		}

		/* Putting a node to sleep is a suspend operation, so it is
		 * throttled by suspend_rate/suspend_cnt. */
		if ((susp_state == 0)
		&&  ((suspend_rate == 0) || (suspend_cnt < suspend_rate))
		&&  (base_state == NODE_STATE_IDLE)
		&&  (node_ptr->last_idle < (now - idle_time))
		&&  ((exc_node_bitmap == NULL) || 
		     (bit_test(exc_node_bitmap, i) == 0))) {
			if (sleep_node_bitmap == NULL)
				sleep_node_bitmap = bit_alloc(node_record_count);
			sleep_cnt++;
			suspend_cnt++;
			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
			bit_set(sleep_node_bitmap, i);
		}
	}

	/* Report the number of nodes in power save mode every ten minutes */
	if ((now - last_log) > 600) {
		info("Power save mode %d nodes", susp_total);
		last_log = now;
	}

	if ((wake_cnt == 0) && (sleep_cnt == 0)) {
		_re_wake();	/* No work to be done now */
		return;		/* both bitmaps are NULL */
	}

	if (sleep_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(sleep_node_bitmap);
		if (nodes)
			_do_suspend(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		bit_free(sleep_node_bitmap);
		last_node_update = now;
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes)
			_do_resume(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		bit_free(wake_node_bitmap);
		last_node_update = now;
	}
}

/*
 * _re_wake - Just in case some resume calls failed, re-issue the requests
 *	periodically for active nodes. We do not increment resume_cnt
 *	since there should be no change in power requirements.
 *
 * Runs at most once per minute and examines at most 20 node records per
 * run, continuing from where the previous run stopped (last_inx) and
 * wrapping around at the end of the node table. Every examined node that is
 * not flagged NODE_STATE_POWER_SAVE is passed to the resume program.
 */
static void _re_wake(void)
{
	static time_t last_wakeup = 0;
	static int last_inx = 0;
	time_t now = time(NULL);
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	int i, lim = MIN(node_record_count, 20);

	/* Run at most once per minute */
	if ((now - last_wakeup) < 60)
		return;
	last_wakeup = now;

	for (i=0; i<lim; i++) {
		/* Validate the saved index before using it: it persists
		 * across calls and would point past the end of the table
		 * if node_record_count were smaller than on the last call */
		if (last_inx >= node_record_count)
			last_inx = 0;
		node_ptr = &node_record_table_ptr[last_inx];
		if ((node_ptr->node_state & NODE_STATE_POWER_SAVE) == 0) {
			if (wake_node_bitmap == NULL)
				wake_node_bitmap = bit_alloc(node_record_count);
			bit_set(wake_node_bitmap, last_inx);
		}
		last_inx++;
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes) {
#if _DEBUG
			info("power_save: rewaking nodes %s", nodes);
#else
			debug("power_save: rewaking nodes %s", nodes);
#endif
			_run_prog(resume_prog, nodes);	
		} else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		bit_free(wake_node_bitmap);
	}		
}

/* Log the request, then run the resume program with the given node
 * list as its argument. The message is logged at "info" level when
 * _DEBUG is set and at "debug" level otherwise. */
static void _do_resume(char *host)
{
#if !_DEBUG
	debug("power_save: waking nodes %s", host);
#else
	info("power_save: waking nodes %s", host);
#endif
	(void) _run_prog(resume_prog, host);
}

/* Log the request, then run the suspend program with the given node
 * list as its argument. The message is logged at "info" level when
 * _DEBUG is set and at "debug" level otherwise. */
static void _do_suspend(char *host)
{
#if !_DEBUG
	debug("power_save: suspending nodes %s", host);
#else
	info("power_save: suspending nodes %s", host);
#endif
	(void) _run_prog(suspend_prog, host);
}

/*
 * _run_prog - fork a child process and execute a program with one argument
 * prog IN - pathname of the program to execute, NULL to do nothing
 * arg IN  - the single argument passed to the program
 * RET process ID of the child, or -1 if prog is NULL or fork() failed.
 *
 * The child is not waited for here; see _kill_zombies().
 */
static pid_t _run_prog(char *prog, char *arg)
{
	char *pname;
	pid_t child;

	if (prog == NULL)	/* disabled, useful for testing */
		return -1;

	/* argv[0] is the program's base name. The strings are handed to
	 * execl() directly: the child has its own copy of the address
	 * space, and copying into fixed-size buffers would truncate long
	 * node lists and could leave them without a terminating NUL. */
	pname = strrchr(prog, '/');
	if (pname == NULL)
		pname = prog;
	else
		pname++;

	child = fork();
	if (child == 0) {
		int i;
		/* Drop the low-numbered descriptors copied from the parent */
		for (i=0; i<128; i++)
			close(i);
		/* The sentinel must have pointer type in a variadic call */
		execl(prog, pname, arg, (char *) NULL);
		/* exec failed: leave without running the parent's atexit
		 * handlers or flushing its stdio buffers in the child */
		_exit(1);
	} else if (child < 0)
		error("fork: %m");
	return child;
}

/* Reap every child process that has already terminated, without
 * blocking. Individual process IDs are not tracked; the value of
 * "child" in _run_prog() could be recorded if per-process tracking
 * were ever wanted. */
static void  _kill_zombies(void)
{
	pid_t reaped;

	do {
		reaped = waitpid(-1, NULL, WNOHANG);
	} while (reaped > 0);
}

/*
 * _init_power_config - Initialize power_save module parameters from the
 *	compiled-in defaults and build the bitmap of excluded nodes.
 * RET 0 on valid configuration to run power saving,
 *	otherwise log the problem and return -1
 *
 * A negative idle_time disables the module (not an error). A suspend or
 * resume rate of zero is valid and means that no limit is imposed;
 * negative rates are rejected. A NULL suspend or resume program only
 * produces a warning.
 */
static int _init_power_config(void)
{
	idle_time     = DEFAULT_IDLE_TIME;
	suspend_rate  = DEFAULT_SUSPEND_RATE;
	resume_rate   = DEFAULT_RESUME_RATE;
	if (DEFAULT_SUSPEND_PROGRAM)
		suspend_prog = xstrdup(DEFAULT_SUSPEND_PROGRAM);
	if (DEFAULT_RESUME_PROGRAM)
		resume_prog = xstrdup(DEFAULT_RESUME_PROGRAM);
	if (DEFAULT_EXCLUDE_SUSPEND_NODES)
		exc_nodes = xstrdup(DEFAULT_EXCLUDE_SUSPEND_NODES);
	if (DEFAULT_EXCLUDE_SUSPEND_PARTITIONS)
		exc_parts = xstrdup(DEFAULT_EXCLUDE_SUSPEND_PARTITIONS);

	if (idle_time < 0) {	/* not an error */
		debug("power_save module disabled, idle_time < 0");
		return -1;
	}
	/* Zero is a legal rate (no limit), only negative values are bad */
	if (suspend_rate < 0) {
		error("power_save module disabled, suspend_rate < 0");
		return -1;
	}
	if (resume_rate < 0) {
		error("power_save module disabled, resume_rate < 0");
		return -1;
	}
	if (suspend_prog == NULL)
		info("WARNING: power_save module has NULL suspend program");
	else if (!_valid_prog(suspend_prog)) {
		error("power_save module disabled, invalid suspend program %s",
			suspend_prog);
		return -1;
	}
	if (resume_prog == NULL)
		info("WARNING: power_save module has NULL resume program");
	else if (!_valid_prog(resume_prog)) {
		error("power_save module disabled, invalid resume program %s",
			resume_prog);
		return -1;
	}

	/* A non-zero return from node_name2bitmap() is treated as failure */
	if (exc_nodes
	&&  (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
		error("power_save module disabled, "
			"invalid excluded nodes %s", exc_nodes);
		return -1;
	}

	/* Add the nodes of every excluded partition to exc_node_bitmap */
	if (exc_parts) {
		char *tmp, *one_part, *part_list;
		struct part_record *part_ptr;
		int rc = 0;
		/* strtok_r() modifies its input, so work on a copy */
		part_list = xstrdup(exc_parts);
		one_part = strtok_r(part_list, ",", &tmp);
		while (one_part != NULL) {
			part_ptr = find_part_record(one_part);
			if (!part_ptr) {
				/* Report the name that was looked up;
				 * part_ptr is NULL here */
				error("power_save module disabled, "
					"invalid excluded partition %s",
					one_part);
				rc = -1;
				break;
			}
			if (exc_node_bitmap)
				bit_or(exc_node_bitmap, part_ptr->node_bitmap);
			else
				exc_node_bitmap = bit_copy(part_ptr->node_bitmap);
			one_part = strtok_r(NULL, ",", &tmp);
		}
		xfree(part_list);
		if (rc)
			return rc;
	}

	if (exc_node_bitmap) {
		char *tmp = bitmap2node_name(exc_node_bitmap);
		debug("power_save module, excluded nodes %s", tmp);
		xfree(tmp);
	}

	return 0;
}

/*
 * _valid_prog - test whether a program is acceptable for execution
 * file_name IN - pathname of the program, must not be NULL
 * RET true if file_name is an absolute pathname of an existing regular
 *	file without group or world write permission, otherwise log the
 *	reason and return false
 */
static bool _valid_prog(char *file_name)
{
	struct stat st;

	if (file_name[0] != '/')
		debug("program %s not absolute pathname", file_name);
	else if (stat(file_name, &st) != 0)
		debug("program %s not found", file_name);
	else if (!S_ISREG(st.st_mode))
		debug("program %s not regular file", file_name);
	else if (st.st_mode & 022)
		debug("program %s has group or world write permission",
			file_name);
	else
		return true;

	return false;
}

/*
 * init_power_save - entry point of the power save module, started as a
 *	pthread. Loads the configuration and, if it is valid, loops until
 *	slurmctld_config.shutdown_time is set: once per second it reaps
 *	finished child processes and, when a node changed state since the
 *	last scan or 60 seconds have elapsed, runs _do_power_work().
 * arg IN - unused
 * RET always NULL
 */
extern void *init_power_save(void *arg)
{
	/* Locks: Read config, node, and partitions */
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, READ_LOCK, READ_LOCK };
	/* Locks: Write node, read jobs and partitions */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, READ_LOCK, WRITE_LOCK, READ_LOCK };
	time_t last_scan = 0;
	int config_rc;

	lock_slurmctld(config_read_lock);
	config_rc = _init_power_config();
	unlock_slurmctld(config_read_lock);

	if (config_rc == 0) {
		while (slurmctld_config.shutdown_time == 0) {
			time_t cur_time;

			sleep(1);
			_kill_zombies();

			/* Scan when a node state changed since the last
			 * scan or when 60 seconds have passed, whichever
			 * happens first */
			cur_time = time(NULL);
			if ((last_node_update >= last_scan)
			||  (cur_time >= (last_scan + 60))) {
				lock_slurmctld(node_write_lock);
				_do_power_work();
				unlock_slurmctld(node_write_lock);
				last_scan = cur_time;
			}
		}
	}

	/* Free all allocated memory */
	xfree(suspend_prog);
	xfree(resume_prog);
	xfree(exc_nodes);
	xfree(exc_parts);
	FREE_NULL_BITMAP(exc_node_bitmap);
	return NULL;
}