diff --git a/NEWS b/NEWS
index e9a2535f4621916b325cf581f9b2318996ab7ba1..f89fee5d8a9d619fbd18a0ab9463d1a120cc2a25 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,7 @@ documents those changes that are of interest to users and admins.
 
  -- Remove srun's --ctrl-comm-ifhn-addr option (for PMI/MPICH2). It is no
     longer needed.
  -- Modify power save mode so that nodes can be powered off when idle. See
+    https://computing.llnl.gov/linux/slurm/power_save.html or "man slurm.conf"
     (SuspendProgram and related parameters) for more information.
 
diff --git a/doc/html/power_save.shtml b/doc/html/power_save.shtml
index 0f4a8de4867bd5b79588ee298179ca4b7c5147c0..68f1b4654feaf1cdc39158fab3f860e4a00b4354 100644
--- a/doc/html/power_save.shtml
+++ b/doc/html/power_save.shtml
@@ -3,18 +3,20 @@
 <h1>Power Saving Guide</h1>
 
 <p>SLURM provides an integrated power saving mechanism beginning with version 1.2.7.
+Beginning with version 1.4.0, nodes can be fully powered down rather
+than only having their power consumption reduced.
 Nodes that remain idle for a configurable period of time can be
 placed in a power saving mode.
 The nodes will be restored to normal operation once work is assigned to them.
-Power saving is accomplished using a <i>cpufreq</i> governor that can change
-CPU frequency and voltage.
+Power saving can be accomplished either by using a <i>cpufreq</i> governor
+that can change CPU frequency and voltage or by powering down the node.
 Note that the <i>cpufreq</i> driver must be enabled in the Linux kernel
 configuration.
 While the "ondemand" governor can be configured to operate at all times to
 automatically alter the CPU performance based upon workload,
 SLURM provides somewhat greater flexibility for power management on a
 cluster.
-Of particular note, SLURM can alter the governors across the cluster
+Of particular note, SLURM can power nodes up or down
 at a configurable rate to prevent rapid changes in power demands.
 For example, starting a 1000 node job on an idle cluster could result
 in an instantaneous surge in power demand of multiple megawatts without
@@ -41,10 +43,10 @@ The default value is 60.
 Use this to prevent rapid drops in power requirements.</li>
 
 <li><b>ResumeRate</b>:
-Maximum number of nodes to be placed into power saving mode
+Maximum number of nodes to be removed from power saving mode
 per minute.
 A value of zero results in no limits being imposed.
-The default value is 60.
+The default value is 300.
 Use this to prevent rapid increases in power requirements.</li>
 
 <li><b>SuspendProgram</b>:
@@ -117,6 +119,6 @@ nodes are in power save mode using messages of this sort:
 You can also configure SLURM without SuspendProgram or ResumeProgram values
 to assess the potential impact of power saving mode before enabling it.</p>
 
-<p style="text-align:center;">Last modified 14 May 2007</p>
+<p style="text-align:center;">Last modified 29 August 2008</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index b0a6509fa02a4d2961d189bd08852e83eec09fb3..6d00f9e0e183355edf54edf602301bb91a550241 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -766,7 +766,7 @@ The value is number of nodes per minute and it can be used to prevent
 power surges if a large number of nodes in power save mode are assigned
 work at the same time (e.g. a large job starts).
 A value of zero results in no limits being imposed.
-The default value is 60 nodes per minute.
+The default value is 300 nodes per minute.
 Related configuration options include \fBResumeProgram\fR, \fBSuspendRate\fR,
 \fBSuspendTime\fR, \fBSuspendProgram\fR, \fBSuspendExcNodes\fR,
 and \fBSuspendExcParts\fR.
diff --git a/src/common/read_config.h b/src/common/read_config.h
index 5c6e52774cf8361c352c1d508b2fc4a7ac1d448d..987294c2ebff52bd8abbaef152592b9a9b0a2599 100644
--- a/src/common/read_config.h
+++ b/src/common/read_config.h
@@ -87,7 +87,7 @@ extern char *default_plugstack;
 #endif
 #define DEFAULT_PROPAGATE_PRIO_PROCESS 0
 #define DEFAULT_RETURN_TO_SERVICE 0
-#define DEFAULT_RESUME_RATE 60
+#define DEFAULT_RESUME_RATE 300
 #define DEFAULT_SAVE_STATE_LOC "/tmp"
 #define DEFAULT_SCHEDROOTFILTER 1
 #define DEFAULT_SCHEDULER_PORT 7321
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 5a7c99638605067e0e0dd30cb8a9c5fcb55463b4..e12ccddbf5c008815ea6af407128b19fc97105c2 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -153,6 +153,8 @@ typedef struct task_info {
 
 typedef struct queued_request {
 	agent_arg_t* agent_arg_ptr;	/* The queued request */
+	time_t first_attempt;		/* Time of first check for batch
+					 * launch RPC *only* */
 	time_t last_attempt;		/* Time of last xmit attempt */
 } queued_request_t;
 
@@ -1459,6 +1461,11 @@ static bool _batch_launch_defer(queued_request_t *queued_req_ptr)
 	if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)
 		return false;
 
+	if (difftime(now, queued_req_ptr->last_attempt) < 5) {
+		/* Reduce overhead by only testing once every 5 secs */
+		return true;
+	}
+
 	launch_msg_ptr = (batch_job_launch_msg_t *)agent_arg_ptr->msg_args;
 	hostlist_deranged_string(agent_arg_ptr->hostlist,
 				 sizeof(hostname), hostname);
@@ -1472,14 +1479,14 @@
 
 	if (((node_ptr->node_state & NODE_STATE_POWER_SAVE) == 0) &&
 	    ((node_ptr->node_state & NODE_STATE_NO_RESPOND) == 0)) {
-info("agent ready to send batch request to %s", hostname);
 		queued_req_ptr->last_attempt = (time_t) 0;
 		return false;
 	}
 
-	if (queued_req_ptr->last_attempt == 0)
-		queued_req_ptr->last_attempt = now;
-	else if (difftime(now, queued_req_ptr->last_attempt) >=
+	if (queued_req_ptr->last_attempt == 0) {
+		queued_req_ptr->first_attempt = now;
+		queued_req_ptr->last_attempt = now;
+	} else if (difftime(now, queued_req_ptr->first_attempt) >=
 		   BATCH_START_TIME) {
 		error("agent waited too long for node %s to come up, "
-		      "sending batch request anyway...");
+		      "sending batch request anyway...", hostname);
@@ -1487,6 +1494,6 @@ info("agent ready to send batch request to %s", hostname);
 		return false;
 	}
 
-info("agent waiting to send batch request to %s", hostname);
+	queued_req_ptr->last_attempt = now;
 	return true;
 }
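
For context, a minimal slurm.conf power-save sketch using the parameters this
patch documents. This is an illustration only: the script paths and node names
are hypothetical, site-specific placeholders, and only the ResumeRate default
of 300 comes from this patch.

    SuspendTime=600                          # seconds idle before a node is suspended
    SuspendRate=60                           # max nodes suspended per minute (documented default)
    ResumeRate=300                           # max nodes resumed per minute (new default)
    SuspendProgram=/usr/sbin/slurm_suspend   # site-provided power-down script (hypothetical path)
    ResumeProgram=/usr/sbin/slurm_resume     # site-provided power-up script (hypothetical path)
    SuspendExcNodes=tux[0-1]                 # nodes exempt from power save (hypothetical names)

With these values, a node idle for ten minutes is handed to SuspendProgram,
and when a large job starts on an idle cluster at most 300 nodes per minute
are powered back up, which is the surge protection the documentation changes
describe.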
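Also for context, a standalone C sketch (not the SLURM sources) of the
two-timestamp deferral pattern the agent.c hunks implement: last_attempt
rate-limits how often a powered-down node is re-tested, while first_attempt
bounds the total time a batch launch RPC may be deferred. Apart from
first_attempt, last_attempt, and BATCH_START_TIME, every name and value below
is invented for illustration, including the 300-second timeout.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define CHECK_INTERVAL   5	/* re-test node state at most every 5 secs */
#define BATCH_START_TIME 300	/* assumed deferral timeout, in seconds */

struct deferred_req {
	time_t first_attempt;	/* time of first failed readiness test */
	time_t last_attempt;	/* time of most recent test */
};

/* Return true to keep the request queued, false to send it now. */
static bool defer(struct deferred_req *req, bool node_ready, time_t now)
{
	if (difftime(now, req->last_attempt) < CHECK_INTERVAL)
		return true;	/* tested recently; skip the state probe */

	if (node_ready) {
		req->last_attempt = (time_t) 0;	/* reset for reuse */
		return false;	/* node is up; send the RPC */
	}

	if (req->last_attempt == 0) {
		req->first_attempt = now;	/* first deferral: start clock */
	} else if (difftime(now, req->first_attempt) >= BATCH_START_TIME) {
		req->last_attempt = (time_t) 0;
		return false;	/* waited too long; send anyway */
	}
	req->last_attempt = now;
	return true;
}

int main(void)
{
	struct deferred_req req = { 0, 0 };
	time_t t0 = time(NULL);

	/* Node stays down: defer at t0, skip the re-test at t0+2,
	 * then give up once BATCH_START_TIME has elapsed. */
	printf("%d\n", defer(&req, false, t0));			/* 1 */
	printf("%d\n", defer(&req, false, t0 + 2));		/* 1 (throttled) */
	printf("%d\n", defer(&req, false, t0 + BATCH_START_TIME + 6));	/* 0 */
	return 0;
}

Keeping the throttle separate from the timeout is what makes the new
first_attempt field necessary: a deferred request costs one node-state probe
per CHECK_INTERVAL rather than one per pass of the agent's retry loop, while
the overall wait is still measured from the first failed test.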