Skip to content
Snippets Groups Projects
Commit 2833b517 authored by Moe Jette's avatar Moe Jette
Browse files

Add support for power saving mode (experimental code to reduce

    clock on nodes that stay in the IDLE state).
parent a6400871
No related branches found
No related tags found
No related merge requests found
......@@ -11,6 +11,8 @@ documents those changes that are of interest to users and admins.
-- Add ability to change MaxNodes and ExcNodeList for pending job
using scontrol.
-- Purge zombie processes spawned via event triggers.
-- Add support for power saving mode (experimental code to reduce
clock on nodes that stay in the IDLE state).
* Changes in SLURM 1.2.6
========================
......
.TH SINFO "1" "November 2006" "sinfo 1.2" "Slurm components"
.TH SINFO "1" "May 2007" "sinfo 1.2" "Slurm components"
.SH "NAME"
sinfo \- view information about SLURM nodes and partitions.
......@@ -347,6 +347,9 @@ node is presently not responding and will not be allocated
any new work. If the node remains non\-responsive, it will
be placed in the \fBDOWN\fR state (except in the case of
\fBDRAINED\fR, \fBDRAINING\fR, or \fBCOMPLETING\fR nodes).
If the node state code is followed by "~", this indicates
the node is presently in a power saving mode (typically
running at reduced frequency).
.TP 12
\fBALLOCATED\fR
The node has been allocated to one or more jobs.
......
......@@ -327,6 +327,9 @@ node is presently not responding and will not be allocated
any new work. If the node remains non\-responsive, it will
be placed in the \fBDOWN\fR state (except in the case of
\fBDRAINED\fR, \fBDRAINING\fR, or \fBCOMPLETING\fR nodes).
If the node state code is followed by "~", this indicates
the node is presently in a power saving mode (typically
running at reduced frequency).
.TP 12
\fBALLOCATED\fR
The node has been allocated to one or more jobs.
......
......@@ -376,6 +376,7 @@ enum node_states {
#define NODE_STATE_DRAIN 0x0200 /* node not to be allocated work */
#define NODE_STATE_COMPLETING 0x0400 /* node is completing allocated job */
#define NODE_STATE_NO_RESPOND 0x0800 /* node is not responding */
#define NODE_STATE_POWER_SAVE 0x1000 /* node in power save mode */
/* used to define the size of the credential.signature size
* used to define the key size of the io_stream_header_t
......
......@@ -636,6 +636,7 @@ char *node_state_string(enum node_states inx)
bool drain_flag = (inx & NODE_STATE_DRAIN);
bool comp_flag = (inx & NODE_STATE_COMPLETING);
bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND);
bool power_flag = (inx & NODE_STATE_POWER_SAVE);
inx = (uint16_t) (inx & NODE_STATE_BASE);
......@@ -670,6 +671,8 @@ char *node_state_string(enum node_states inx)
if (inx == NODE_STATE_IDLE) {
if (no_resp_flag)
return "IDLE*";
if (power_flag)
return "IDLE~";
return "IDLE";
}
if (inx == NODE_STATE_UNKNOWN) {
......@@ -685,6 +688,7 @@ char *node_state_string_compact(enum node_states inx)
bool drain_flag = (inx & NODE_STATE_DRAIN);
bool comp_flag = (inx & NODE_STATE_COMPLETING);
bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND);
bool power_flag = (inx & NODE_STATE_POWER_SAVE);
inx = (uint16_t) (inx & NODE_STATE_BASE);
......@@ -719,6 +723,8 @@ char *node_state_string_compact(enum node_states inx)
if (inx == NODE_STATE_IDLE) {
if (no_resp_flag)
return "IDLE*";
if (power_flag)
return "IDLE~";
return "IDLE";
}
if (inx == NODE_STATE_UNKNOWN) {
......
......@@ -27,6 +27,7 @@ slurmctld_SOURCES = \
partition_mgr.c \
ping_nodes.c \
ping_nodes.h \
power_save.c \
proc_req.c \
proc_req.h \
read_config.c \
......
......@@ -77,7 +77,7 @@ am_slurmctld_OBJECTS = agent.$(OBJEXT) backup.$(OBJEXT) \
controller.$(OBJEXT) job_mgr.$(OBJEXT) job_scheduler.$(OBJEXT) \
locks.$(OBJEXT) node_mgr.$(OBJEXT) node_scheduler.$(OBJEXT) \
partition_mgr.$(OBJEXT) ping_nodes.$(OBJEXT) \
proc_req.$(OBJEXT) read_config.$(OBJEXT) \
power_save.$(OBJEXT) proc_req.$(OBJEXT) read_config.$(OBJEXT) \
sched_plugin.$(OBJEXT) srun_comm.$(OBJEXT) \
state_save.$(OBJEXT) step_mgr.$(OBJEXT) trigger_mgr.$(OBJEXT)
slurmctld_OBJECTS = $(am_slurmctld_OBJECTS)
......@@ -303,6 +303,7 @@ slurmctld_SOURCES = \
partition_mgr.c \
ping_nodes.c \
ping_nodes.h \
power_save.c \
proc_req.c \
proc_req.h \
read_config.c \
......@@ -402,6 +403,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/node_scheduler.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/partition_mgr.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ping_nodes.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/power_save.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc_req.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/read_config.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sched_plugin.Po@am__quote@
......
......@@ -174,7 +174,7 @@ typedef struct connection_arg {
int main(int argc, char *argv[])
{
int error_code;
pthread_attr_t thread_attr_save, thread_attr_sig, thread_attr_rpc;
pthread_attr_t thread_attr;
struct stat stat_buf;
/*
......@@ -324,31 +324,41 @@ int main(int argc, char *argv[])
slurm_mutex_lock(&slurmctld_config.thread_count_lock);
slurmctld_config.server_thread_count++;
slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
slurm_attr_init(&thread_attr_rpc);
slurm_attr_init(&thread_attr);
if (pthread_create(&slurmctld_config.thread_id_rpc,
&thread_attr_rpc,_slurmctld_rpc_mgr, NULL))
&thread_attr,_slurmctld_rpc_mgr, NULL))
fatal("pthread_create error %m");
slurm_attr_destroy(&thread_attr_rpc);
slurm_attr_destroy(&thread_attr);
/*
* create attached thread for signal handling
*/
slurm_attr_init(&thread_attr_sig);
slurm_attr_init(&thread_attr);
if (pthread_create(&slurmctld_config.thread_id_sig,
&thread_attr_sig, _slurmctld_signal_hand,
&thread_attr, _slurmctld_signal_hand,
NULL))
fatal("pthread_create %m");
slurm_attr_destroy(&thread_attr_sig);
slurm_attr_destroy(&thread_attr);
/*
* create attached thread for state save
*/
slurm_attr_init(&thread_attr_save);
slurm_attr_init(&thread_attr);
if (pthread_create(&slurmctld_config.thread_id_save,
&thread_attr_save, slurmctld_state_save,
&thread_attr, slurmctld_state_save,
NULL))
fatal("pthread_create %m");
slurm_attr_destroy(&thread_attr_save);
slurm_attr_destroy(&thread_attr);
/*
* create attached thread for node power management
*/
slurm_attr_init(&thread_attr);
if (pthread_create(&slurmctld_config.thread_id_power,
&thread_attr, init_power_save,
NULL))
fatal("pthread_create %m");
slurm_attr_destroy(&thread_attr);
/*
* process slurm background activities, could run as pthread
......@@ -360,6 +370,7 @@ int main(int argc, char *argv[])
pthread_join(slurmctld_config.thread_id_sig, NULL);
pthread_join(slurmctld_config.thread_id_rpc, NULL);
pthread_join(slurmctld_config.thread_id_save, NULL);
pthread_join(slurmctld_config.thread_id_power,NULL);
if (select_g_state_save(slurmctld_conf.state_save_location)
!= SLURM_SUCCESS )
error("failed to save node selection state");
......
/*****************************************************************************\
* power_save.c - support node power saving mode. Nodes which have been
* idle for an extended period of time will be placed into a power saving
* mode by running an arbitrary script (typically to set frequency governor).
* When the node is restored to normal operation, another script will be
* executed. Many parameters are available to control this mode of operation.
*****************************************************************************
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "src/common/bitstring.h"
#include "src/common/xstring.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/slurmctld.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <unistd.h>
/* NOTE: These parameters will be moved into the slurm.conf file in version 1.3
* Directly modify the default values here in order to enable this capability
* in SLURM version 1.2. */
/* Node becomes eligible for power saving mode after being idle for
* this number of seconds. A negative value disables power saving mode. */
#define DEFAULT_IDLE_TIME -1
/* Maximum number of nodes to be placed into or removed from power saving mode
* per minute. Use this to prevent rapid changing in power requirements.
* Note that up to DEFAULT_SUSPEND_RATE + DEFAULT_RESUME_RATE processes may
* be created at the same time, so use reasonable limits. A value of zero
* results in no limits being imposed.
* NOTE(review): _init_power_config() currently disables the module when
* either rate is < 1, so a value of zero is rejected rather than treated
* as "no limit" -- confirm which behavior is intended. */
#define DEFAULT_SUSPEND_RATE 60
#define DEFAULT_RESUME_RATE 60
/* Programs to be executed to place nodes into or out of power saving mode.
* These are run as user SlurmUser. The hostname of the node to be modified
* will be passed as an argument to the program. */
#define DEFAULT_SUSPEND_PROGRAM "/home/jette/slurm.way/sbin/slurm.node.suspend"
#define DEFAULT_RESUME_PROGRAM "/home/jette/slurm.way/sbin/slurm.node.resume"
/* Individual nodes or all nodes in selected partitions can be excluded from
* being placed into power saving mode. SLURM hostlist expressions can be used.
* Multiple partitions may be listed with a comma separator. */
#define DEFAULT_EXCLUDE_SUSPEND_NODES NULL
#define DEFAULT_EXCLUDE_SUSPEND_PARTITIONS NULL
/* Working copies of the tunables above, loaded by _init_power_config() */
int idle_time, suspend_rate, resume_rate;
/* Programs passed to _run_prog() by _do_suspend() and _do_resume() */
char *suspend_prog = NULL, *resume_prog = NULL;
/* Excluded node and partition expressions, copied from the defaults above */
char *exc_nodes = NULL, *exc_parts = NULL;
/* Nodes never to be suspended; built by _init_power_config() from exc_nodes
* and from the node bitmaps of the partitions named in exc_parts */
bitstr_t *exc_node_bitmap = NULL;
/* Counters compared against suspend_rate and resume_rate, and aged over
* time, by _do_power_work() */
int suspend_cnt, resume_cnt;
/* Forward declarations of the module's private functions */
static void _do_power_work(void);
static void _do_resume(char *host);
static void _do_suspend(char *host);
static int _init_power_config(void);
static void _kill_zombies(void);
static pid_t _run_prog(char *prog, char *arg);
static bool _valid_prog(char *file_name);
/* Perform any power change work to nodes.
 *
 * Scans the node table once to find nodes that should leave power saving
 * mode (flagged POWER_SAVE but now allocated or recently idle) and nodes
 * that should enter it (idle longer than idle_time and not excluded), then
 * runs the resume/suspend programs for them subject to the per-minute
 * limits resume_rate and suspend_rate.
 *
 * The caller (init_power_save) holds the node write lock around this call
 * and invokes it from a single thread, which is what makes the static
 * book-keeping below safe. */
static void _do_power_work(void)
{
	/* These must persist across calls: the time the rate counters were
	 * last aged and the time the power-save node count was last logged.
	 * As automatic variables they were reset to zero on every call, which
	 * cleared the rate counters on each scan and defeated the
	 * once-per-300-seconds log throttle. */
	static time_t last_work_scan = 0, last_log = 0;
	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
	time_t now = time(NULL), delta_t;
	uint16_t base_state, susp_state;
	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
	struct node_record *node_ptr;

	/* Build bitmaps identifying each node which should change state */
	for (i=0; i<node_record_count; i++) {
		node_ptr = &node_record_table_ptr[i];
		base_state = node_ptr->node_state & NODE_STATE_BASE;
		susp_state = node_ptr->node_state & NODE_STATE_POWER_SAVE;

		if (susp_state)
			susp_total++;

		/* Wake: node is in power save mode but has work or was
		 * idle only recently */
		if (susp_state
		&&  ((base_state == NODE_STATE_ALLOCATED)
		||   (node_ptr->last_idle > (now - idle_time)))) {
			if (wake_cnt == 0)
				wake_node_bitmap = bit_alloc(node_record_count);
			wake_cnt++;
			bit_set(wake_node_bitmap, i);
		}

		/* Sleep: node is idle long enough and not excluded */
		if ((susp_state == 0)
		&&  (base_state == NODE_STATE_IDLE)
		&&  (node_ptr->last_idle < (now - idle_time))
		&&  ((exc_node_bitmap == NULL) ||
		     (bit_test(exc_node_bitmap, i) == 0))) {
			if (sleep_cnt == 0)
				sleep_node_bitmap = bit_alloc(node_record_count);
			sleep_cnt++;
			bit_set(sleep_node_bitmap, i);
		}
	}

	if ((susp_total > 0) && ((now - last_log) > 300)) {
		info("Power save mode %d nodes", susp_total);
		last_log = now;	/* throttle: at most one log per 300 secs */
	}

	if ((wake_cnt == 0) && (sleep_cnt == 0))
		goto fini;	/* No work to be done now */

	/* FIXME: Consider re-running wake command on nodes just in case
	 * some wake requests failed. This is the place to do such work. */

	/* Age the counts of nodes which recently had their state changed.
	 * After a full minute the counts are cleared, otherwise they are
	 * scaled down in proportion to the time elapsed. A non-positive
	 * delta (same second, or clock stepped backwards) leaves the counts
	 * unchanged rather than inflating them. */
	delta_t = now - last_work_scan;
	if (delta_t >= 60) {
		suspend_cnt = 0;
		resume_cnt  = 0;
	} else if (delta_t > 0) {
		float rate = (60 - delta_t) / 60.0;
		suspend_cnt *= rate;
		resume_cnt  *= rate;
	}
	last_work_scan = now;

	/* Perform work up to limits. Each program launched is counted so
	 * that no more than suspend_rate/resume_rate nodes change state per
	 * minute; nodes left over are picked up by a later scan. */
	for (i=0; i<node_record_count; i++) {
		node_ptr = &node_record_table_ptr[i];

		if ((suspend_cnt < suspend_rate)
		&&  sleep_node_bitmap
		&&  bit_test(sleep_node_bitmap, i)) {
			_do_suspend(node_ptr->name);
			suspend_cnt++;
			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
			last_node_update = now;
		}

		if ((resume_cnt < resume_rate)
		&&  wake_node_bitmap
		&&  bit_test(wake_node_bitmap, i)) {
			_do_resume(node_ptr->name);
			resume_cnt++;
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			last_node_update = now;
		}
	}

fini:	FREE_NULL_BITMAP(wake_node_bitmap);
	FREE_NULL_BITMAP(sleep_node_bitmap);
}
/* Log the wake-up request for "host", then launch the resume program
 * with the host name as its argument. The child pid is not tracked. */
static void _do_resume(char *host)
{
	debug("power_save: waking node %s", host);
	(void) _run_prog(resume_prog, host);
}
/* Log the suspend request for "host", then launch the suspend program
 * with the host name as its argument. The child pid is not tracked. */
static void _do_suspend(char *host)
{
	debug("power_save: suspending node %s", host);
	(void) _run_prog(suspend_prog, host);
}
/* Fork a child process which executes "prog" with "arg" as its sole
 * argument (argv[0] is the base name of the program).
 * prog IN - pathname of the program to execute, may be NULL
 * arg  IN - argument passed to the program (a node name)
 * RET pid of the child process, or -1 if no program is configured, a
 *     name is too long for the local buffers, or fork() failed
 *
 * The child is not waited for here; see _kill_zombies(). */
static pid_t _run_prog(char *prog, char *arg)
{
	char program[1024], arg0[1024], arg1[1024], *pname;
	pid_t child;

	/* _init_power_config() accepts a NULL suspend or resume program
	 * (with a warning), so there may be nothing to run */
	if (prog == NULL)
		return (pid_t) -1;

	/* strncpy() zero-fills the remainder of the destination, so a
	 * non-zero final byte means the source did not fit. Refuse to run
	 * a truncated pathname or pass a truncated node name. */
	strncpy(program, prog, sizeof(program));
	if (program[sizeof(program) - 1] != '\0') {
		error("power_save: program name too long: %s", prog);
		return (pid_t) -1;
	}
	strncpy(arg1, arg, sizeof(arg1));
	if (arg1[sizeof(arg1) - 1] != '\0') {
		error("power_save: program argument too long: %s", arg);
		return (pid_t) -1;
	}

	pname = strrchr(program, '/');
	if (pname == NULL)
		pname = program;
	else
		pname++;
	/* pname lies within program[], so it always fits in arg0[] */
	strncpy(arg0, pname, sizeof(arg0));
	arg0[sizeof(arg0) - 1] = '\0';

	child = fork();
	if (child == 0) {
		int i;
		/* Drop the daemon's open files (including stdio) */
		for (i=0; i<128; i++)
			close(i);
		execl(program, arg0, arg1, (char *) NULL);
		/* exec failed: leave without running the parent's atexit
		 * handlers or flushing its stdio buffers in the child */
		_exit(1);
	} else if (child < 0)
		error("fork: %m");

	return child;
}
/* Reap every child process that has already terminated, without
 * blocking. Individual process IDs are not tracked; if that is ever
 * wanted, the value of "child" in _run_prog() could be recorded. */
static void _kill_zombies(void)
{
	pid_t reaped;

	do {
		reaped = waitpid(-1, NULL, WNOHANG);
	} while (reaped > 0);
}
/* Initialize power_save module parameters.
 * Loads the compiled-in defaults into the module's global variables,
 * validates them and builds exc_node_bitmap from the excluded nodes and
 * excluded partitions.
 * Return 0 on valid configuration to run power saving,
 * otherwise log the problem and return -1.
 *
 * Memory allocated here is released by init_power_save() regardless of
 * the return value. */
static int _init_power_config(void)
{
	idle_time = DEFAULT_IDLE_TIME;
	suspend_rate = DEFAULT_SUSPEND_RATE;
	resume_rate = DEFAULT_RESUME_RATE;
	if (DEFAULT_SUSPEND_PROGRAM)
		suspend_prog = xstrdup(DEFAULT_SUSPEND_PROGRAM);
	if (DEFAULT_RESUME_PROGRAM)
		resume_prog = xstrdup(DEFAULT_RESUME_PROGRAM);
	if (DEFAULT_EXCLUDE_SUSPEND_NODES)
		exc_nodes = xstrdup(DEFAULT_EXCLUDE_SUSPEND_NODES);
	if (DEFAULT_EXCLUDE_SUSPEND_PARTITIONS)
		exc_parts = xstrdup(DEFAULT_EXCLUDE_SUSPEND_PARTITIONS);

	if (idle_time < 0) {	/* not an error */
		debug("power_save module disabled, idle_time < 0");
		return -1;
	}
	if (suspend_rate < 1) {
		error("power_save module disabled, suspend_rate < 1");
		return -1;
	}
	if (resume_rate < 1) {
		error("power_save module disabled, resume_rate < 1");
		return -1;
	}

	/* A missing program is tolerated, an unusable one is not */
	if (suspend_prog == NULL)
		info("WARNING: power_save module has NULL suspend program");
	else if (!_valid_prog(suspend_prog)) {
		error("power_save module disabled, invalid suspend program %s",
			suspend_prog);
		return -1;
	}
	if (resume_prog == NULL)
		info("WARNING: power_save module has NULL resume program");
	else if (!_valid_prog(resume_prog)) {
		error("power_save module disabled, invalid resume program %s",
			resume_prog);
		return -1;
	}

	if (exc_nodes
	&&  (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
		error("power_save module disabled, "
			"invalid excluded nodes %s", exc_nodes);
		return -1;
	}

	if (exc_parts) {
		char *tmp, *one_part, *part_list;
		struct part_record *part_ptr;
		int rc = 0;

		/* strtok_r() modifies its input, so work on a copy */
		part_list = xstrdup(exc_parts);
		one_part = strtok_r(part_list, ",", &tmp);
		while (one_part != NULL) {
			part_ptr = find_part_record(one_part);
			if (!part_ptr) {
				/* Report the name that was not found;
				 * part_ptr is NULL here and is not a
				 * string in any case */
				error("power_save module disabled, "
					"invalid excluded partition %s",
					one_part);
				rc = -1;
				break;
			}
			/* Merge this partition's nodes into the exclusions */
			if (exc_node_bitmap)
				bit_or(exc_node_bitmap, part_ptr->node_bitmap);
			else
				exc_node_bitmap = bit_copy(part_ptr->node_bitmap);
			one_part = strtok_r(NULL, ",", &tmp);
		}
		xfree(part_list);
		if (rc)
			return rc;
	}

	if (exc_node_bitmap) {
		char *tmp = bitmap2node_name(exc_node_bitmap);
		debug("power_save module, excluded nodes %s", tmp);
		xfree(tmp);
	}

	return 0;
}
/* Determine if a program is acceptable for execution by this module.
 * file_name IN - pathname of the program
 * RET true only if file_name is an absolute pathname of an existing
 *     regular file lacking group and world write permission; otherwise
 *     the reason is logged and false is returned */
static bool _valid_prog(char *file_name)
{
	struct stat info;

	/* The checks run in order and stop at the first failure */
	if (file_name[0] != '/')
		debug("program %s not absolute pathname", file_name);
	else if (stat(file_name, &info) != 0)
		debug("program %s not found", file_name);
	else if (!S_ISREG(info.st_mode))
		debug("program %s not regular file", file_name);
	else if (info.st_mode & 022)
		debug("program %s has group or world write permission",
			file_name);
	else
		return true;

	return false;
}
/*
 * init_power_save - initialize the power save module. Started as a
 *	pthread. Terminates automatically at slurmctld shutdown time.
 *	Input and output are unused.
 *
 * After loading the configuration, the thread wakes once per second to
 * reap finished child processes and runs a node scan when the node table
 * has changed since the previous scan or 60 seconds have elapsed.
 */
extern void *init_power_save(void *arg)
{
	/* Locks: Read config, node, and partitions */
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, READ_LOCK, READ_LOCK };
	/* Locks: Write node, read jobs and partitions */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, READ_LOCK, WRITE_LOCK, READ_LOCK };
	time_t last_power_scan = 0;
	int config_rc;

	lock_slurmctld(config_read_lock);
	config_rc = _init_power_config();
	unlock_slurmctld(config_read_lock);

	/* An unusable or disabled configuration skips straight to cleanup */
	while ((config_rc == 0) && (slurmctld_config.shutdown_time == 0)) {
		time_t now;

		sleep(1);
		_kill_zombies();

		/* Only run every 60 seconds or after a node state change,
		 * whichever happens first */
		now = time(NULL);
		if ((last_node_update >= last_power_scan)
		||  (now >= (last_power_scan + 60))) {
			lock_slurmctld(node_write_lock);
			_do_power_work();
			unlock_slurmctld(node_write_lock);
			last_power_scan = now;
		}
	}

	/* Free all allocated memory */
	xfree(suspend_prog);
	xfree(resume_prog);
	xfree(exc_nodes);
	xfree(exc_parts);
	FREE_NULL_BITMAP(exc_node_bitmap);
	return NULL;
}
......@@ -3,7 +3,7 @@
*
* $Id$
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Copyright (C) 2002-2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov> et. al.
* UCRL-CODE-226842.
......@@ -141,12 +141,14 @@ typedef struct slurmctld_config {
pthread_t thread_id_main;
pthread_t thread_id_save;
pthread_t thread_id_sig;
pthread_t thread_id_power;
pthread_t thread_id_rpc;
#else
int thread_count_lock;
int thread_id_main;
int thread_id_save;
int thread_id_sig;
int thread_id_power;
int thread_id_rpc;
#endif
} slurmctld_config_t;
......@@ -675,6 +677,13 @@ extern int init_node_conf ();
*/
extern int init_part_conf (void);
/*
* init_power_save - initialize the power save module. Started as a
* pthread. Terminates automatically at slurmctld shutdown time.
* Input and output are unused.
*/
extern void *init_power_save(void *arg);
/*
* is_node_down - determine if the specified node's state is DOWN
* IN name - name of the node
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment