From 577d4f8375a1aef96b0076f11c412215c3dc3447 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Wed, 28 Jan 2015 11:07:53 -0800
Subject: [PATCH] Power management work

Flesh out power management web page
Change default increase_rate from 10% to 20%
Add PowerParameters option of recent_job
Add reconfigure and job_resume APIs
Improve shutdown timing using a pthread_cond flag
---
 doc/html/power_mgmt.shtml           | 147 ++++++++++++++++++++++++++--
 doc/man/man5/slurm.conf.5           |  11 ++-
 src/common/power.c                  |  32 +++++-
 src/common/power.h                  |  14 ++-
 src/plugins/power/cray/power_cray.c |  55 ++++++++++-
 src/plugins/power/none/power_none.c |  12 +++
 src/slurmctld/job_mgr.c             |   2 +
 src/slurmctld/read_config.c         |   4 +-
 8 files changed, 258 insertions(+), 19 deletions(-)

diff --git a/doc/html/power_mgmt.shtml b/doc/html/power_mgmt.shtml
index a3a2ed7600e..638ff857b59 100644
--- a/doc/html/power_mgmt.shtml
+++ b/doc/html/power_mgmt.shtml
@@ -2,8 +2,18 @@

 <h1>Slurm Power Management Guide</h1>

-<p>Slurm provied an integrated power management system for power capping.
-This is distict from Slurm's ability to
+<p>Slurm provides an integrated power management system for power capping.
+The mode of operation is to take the configured power cap for the system and
+distribute it across the compute nodes under Slurm control.
+Initially that power is distributed evenly across all compute nodes.
+Slurm then monitors actual power consumption and redistributes power as appropriate.
+Specifically, Slurm lowers the power caps on nodes using less than their cap
+and redistributes that power across the other nodes.
+The thresholds at which a node's power cap is raised or lowered are configurable,
+as is the rate of change of the power cap.
+In addition, starting a job on a node immediately triggers resetting the node's
+power cap to a higher level.
+Note that this functionality is distinct from Slurm's ability to
 <a href="power_save.html">power down idle nodes</a>.</p>

 <h2>Configuration</h2>
@@ -13,24 +23,147 @@ This is distict from Slurm's ability to
 <li><b>PowerParameters</b>: Defines power management behavior.
+Changes to this value take effect when the Slurm daemons are reconfigured.
 Currently valid options are:
 <ul>
-<li>ClusterCap=#[KW|MW] - Used to establish cluster-wide power cap. Default
-  units are watts.</li>
+<li><b>balance_interval=#</b> -
+  Specifies the time interval, in seconds, between attempts to balance power
+  caps across the nodes.
+  The default value is 30 seconds.
+  Supported by the power/cray plugin.</li>
+<li><b>capmc_path=</b> -
+  Specifies the absolute path of the <b>capmc</b> command.
+  The default value is "/opt/cray/capmc/default/bin/capmc".
+  Supported by the power/cray plugin.</li>
+<li><b>cap_watts=#[KW|MW]</b> -
+  Specifies the power limit to be established across all compute nodes managed
+  by Slurm.
+  The default value is no limit.
+  Supported by the power/cray plugin.</li>
+<li><b>decrease_rate=#</b> -
+  Specifies the maximum rate of change in the power cap for a node where the
+  actual power usage is below the power cap by an amount greater than
+  lower_threshold (see below).
+  Value represents a percentage of the difference between a node's minimum and
+  maximum power consumption.
+  The default value is 50 percent.
+  Supported by the power/cray plugin.</li>
+<li><b>increase_rate=#</b> -
+  Specifies the maximum rate of change in the power cap for a node where the
+  actual power usage is within upper_threshold (see below) of the power cap.
+  Value represents a percentage of the difference between a node's minimum and
+  maximum power consumption.
+  The default value is 20 percent.
+  Supported by the power/cray plugin.</li>
+<li><b>lower_threshold=#</b> -
+  Specify a lower power consumption threshold.
+  If a node's current power consumption is below this percentage of its current
+  cap, then its power cap will be reduced.
+  The default value is 90 percent.
+  Supported by the power/cray plugin.</li>
+<li><b>recent_job=#</b> -
+  If a job has started or resumed execution (from suspend) on a compute node
+  within this number of seconds from the current time, the node's power cap will
+  be increased to the maximum.
+  The default value is 300 seconds.
+  Supported by the power/cray plugin.</li>
+<li><b>upper_threshold=#</b> -
+  Specify an upper power consumption threshold.
+  If a node's current power consumption is above this percentage of its current
+  cap, then its power cap will be increased to the extent possible.
+  A node's power cap will also be increased if a job is newly started on it.
+  The default value is 95 percent.
+  Supported by the power/cray plugin.</li>
 </ul></li>
 <li><b>PowerPlugin</b>: Identifies the plugin used to manage system power
 consumption.
+Changes to this value require restarting Slurm daemons to take effect.
 Currently valid options are:
 <ul>
-<li>cray - Used for Cray systems with power monitoring and management
+<li><b>power/cray</b> -
+  Used for Cray systems with power monitoring and management
 functionality included as part of System Management Workstation (SMW)
 7.0.UP03.</li>
-<li>none - No power management support. The default value.</li>
+<li><b>power/none</b> - No power management support. The default value.</li>
 </ul></li>
+</ul>
+
+<h2>Example</h2>
+
+<h3>Initial State</h3>
+<p>In our example, assume the following configuration:
+a 10 compute node cluster, where each node has a minimum power consumption of
+100 watts and a maximum power consumption of 200 watts.
+The following values are set for PowerParameters:
+balance_interval=60,
+cap_watts=1800,
+decrease_rate=30, increase_rate=10,
+lower_threshold=90, upper_threshold=98.
+The initial state is simply based upon cap_watts divided by the number of
+compute nodes: 1800 watts / 10 nodes = 180 watts per node.</p>
+
+<h3>State in 60 Seconds</h3>
+<p>The power consumption is then examined balance_interval (60 seconds) later.
+Assume that one of those nodes is consuming 110 watts and the others are
+using 180 watts.
+First we identify which nodes are consuming less than their lower_threshold
+of the power cap: 90% x 180 watts = 162 watts.
+One node falls in this category with 110 watts of power consumption.
+Its power cap is reduced by either half of the difference between its current
+power cap and power consumption ((180 watts - 110 watts) / 2 = 35 watts) or
+decrease_rate, which is a percentage of the difference between the node's
+maximum and minimum power consumption ((200 watts - 100 watts) x 30% = 30 watts), whichever is smaller.
+So that node's power cap is reduced from 180 watts to 150 watts.
+Ignoring the upper_threshold parameter for now, we now have 1650 watts available
+to distribute to the remaining 9 compute nodes, or 183 watts per node
+(1650 watts / 9 nodes = 183 watts per node).</p>
+<h3>State in 120 Seconds</h3>
+<p>The power consumption is then examined balance_interval (60 seconds) later.
+Assume that one of those nodes is still consuming 110 watts, a second node is
+consuming 115 watts, and the other eight are using 183 watts.
+First we identify which nodes are consuming less than their lower_threshold.
+Our node using 110 watts has its cap reduced by half of the difference between
+its current power cap and power consumption
+((150 watts - 110 watts) / 2 = 20 watts);
+so that node's power cap is reduced from 150 watts to 130 watts.
+The node consuming 115 watts has its power cap reduced by 30 watts based on
+decrease_rate; so that node's power cap is reduced from 183 watts to 153 watts.
+That leaves 1517 watts (1800 watts - 130 watts - 153 watts = 1517 watts) to
+be distributed over 8 nodes or 189 watts per node.</p>
+
+<h3>State in 180 Seconds</h3>
+<p>The power consumption is then examined balance_interval (60 seconds) later.
+Assume the node previously consuming 110 watts is now consuming 128 watts.
+Since that is over upper_threshold of its power cap
+(98% x 130 watts = 127 watts), its power cap is increased by increase_rate
+((200 watts - 100 watts) x 10% = 10 watts), so its power cap goes from
+130 watts to 140 watts.
+Assume the node previously consuming 115 watts has been allocated a new job.
+This triggers the node to be allocated the same power cap as nodes previously
+running at their power cap.
+Therefore we have 1660 watts available (1800 watts - 140 watts = 1660 watts)
+to be distributed over 9 nodes or 184 watts per node.</p>
+
+<h2>Notes</h2>
+<ul>
+<li>We need to determine how the current power consumption data is gathered
+    and determine the accuracy of that data.</li>
+<li>The current algorithm manages power on a per-node basis without considering
+    job allocations. Should an attempt be made to maintain uniform power caps
+    across all nodes associated with each job? Doing so could greatly complicate
+    the logic.</li>
+<li>Current default values for configuration parameters should probably be
+    changed once we have a better understanding of the algorithm's behavior.</li>
+<li>No integration of this logic with gang scheduling currently exists.
+    It is not clear that such a configuration is practical to support, as gang
+    scheduling time slices will typically be smaller than the power management
+    balance_interval.</li>
+<li>No integration with the logic for powering down idle nodes is currently
+    implemented. That should be practical to add, but has not yet been studied.</li>
 </ul>

-<p style="text-align:center;">Last modified 20 January 2015</p>
+<p style="text-align:center;">Last modified 28 January 2015</p>

 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 5bf3f42f35b..a366bdce95a 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1402,6 +1402,7 @@ on SPANK plugins, see the \fBspank\fR(8) manual.
 \fBPowerParameters\fR
 System power management parameters.
 The supported parameters are specific to the \fBPowerPlugin\fR.
+Changes to this value take effect when the Slurm daemons are reconfigured.
 More information about system power management is available here
 <http://slurm.schedmd.com/power_mgmt.html>.
 Options current supported by any plugins are listed below.
@@ -1438,7 +1439,7 @@ Specifies the maximum rate of change in the power cap for a node where the
 actual power usage is within \fBupper_threshold\fR (see below) of the power cap.
 Value represents a percentage of the difference between a node's minimum and
 maximum power consumption.
-The default value is 10 percent.
+The default value is 20 percent.
 Supported by the power/cray plugin.
 .TP
 \fBlower_threshold=#\fR
 Specify a lower power consumption threshold.
 If a node's current power consumption is below this percentage of its current
 cap, then its power cap will be reduced.
The default value is 90 percent. Supported by the power/cray plugin. .TP +\fBrecent_job=#\fR +If a job has started or resumed execution (from suspend) on a compute node +within this number of seconds from the current time, the node's power cap will +be increased to the maximum. +The default value is 300 seconds. +Supported by the power/cray plugin. +.TP \fBupper_threshold=#\fR Specify an upper power consumption threshold. If a node's current power consumption is above this percentage of its current @@ -1462,6 +1470,7 @@ Identifies the plugin used for system power management. Currently supported plugins include: \fBcray\fR and \fBnone\fR (default). +Changes to this value require restarting Slurm daemons to take effect. More information about system power management is available here <http://slurm.schedmd.com/power_mgmt.html>. diff --git a/src/common/power.c b/src/common/power.c index 8a2b8a3a7d7..63ffd2f9de0 100644 --- a/src/common/power.c +++ b/src/common/power.c @@ -54,7 +54,9 @@ * working. If you need to add fields, add them at the end of the structure. */ typedef struct slurm_power_ops { + void (*job_resume) (struct job_record *job_ptr); void (*job_start) (struct job_record *job_ptr); + void (*reconfig) (void); } slurm_power_ops_t; /* @@ -62,7 +64,9 @@ typedef struct slurm_power_ops { * declared for slurm_power_ops_t. */ static const char *syms[] = { - "power_p_job_start" + "power_p_job_resume", + "power_p_job_start", + "power_p_reconfig" }; static int g_context_cnt = -1; @@ -72,6 +76,7 @@ static char *power_plugin_list = NULL; static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER; static bool init_run = false; +/* Initialize the power plugin */ extern int power_g_init(void) { int rc = SLURM_SUCCESS; @@ -125,6 +130,7 @@ fini: return rc; } +/* Terminate the power plugin and free all memory */ extern int power_g_fini(void) { int i, j, rc = SLURM_SUCCESS; @@ -151,6 +157,30 @@ fini: slurm_mutex_unlock(&g_context_lock); return rc; } +/* Read the configuration file */ +extern void power_g_reconfig(void) +{ + int i; + + (void) power_g_init(); + slurm_mutex_lock(&g_context_lock); + for (i = 0; i < g_context_cnt; i++) + (*(ops[i].reconfig))(); + slurm_mutex_unlock(&g_context_lock); +} + +/* Note that a suspended job has been resumed */ +extern void power_g_job_resume(struct job_record *job_ptr) +{ + int i; + + (void) power_g_init(); + slurm_mutex_lock(&g_context_lock); + for (i = 0; i < g_context_cnt; i++) + (*(ops[i].job_resume))(job_ptr); + slurm_mutex_unlock(&g_context_lock); +} + /* Note that a job has been allocated resources and is ready to start */ extern void power_g_job_start(struct job_record *job_ptr) { diff --git a/src/common/power.h b/src/common/power.h index eb34e5d1bcf..6eaa731608e 100644 --- a/src/common/power.h +++ b/src/common/power.h @@ -48,11 +48,17 @@ /*****************************************************************************\ * PLUGIN FUNCTIONS \*****************************************************************************/ -/* initialize the power plugin */ -extern int power_g_init (void); +/* Initialize the power plugin */ +extern int power_g_init(void); -/* terminate the power plugin and free all memory */ -extern int power_g_fini (void); +/* Terminate the power plugin and free all memory */ +extern int power_g_fini(void); + +/* Read the configuration file */ +extern void power_g_reconfig(void); + +/* Note that a suspended job has been resumed */ +extern void power_g_job_resume(struct job_record *job_ptr); /* Note that a job has been allocated resources and 
is ready to start */ extern void power_g_job_start(struct job_record *job_ptr); diff --git a/src/plugins/power/cray/power_cray.c b/src/plugins/power/cray/power_cray.c index ad3b0aa3d54..f71dfdc67bd 100644 --- a/src/plugins/power/cray/power_cray.c +++ b/src/plugins/power/cray/power_cray.c @@ -63,7 +63,7 @@ #define DEFAULT_CAPMC_PATH "/opt/cray/capmc/default/bin/capmc" #define DEFAULT_CAP_WATTS 0 #define DEFAULT_DECREASE_RATE 50 -#define DEFAULT_INCREASE_RATE 10 +#define DEFAULT_INCREASE_RATE 20 #define DEFAULT_LOWER_THRESHOLD 90 #define DEFAULT_UPPER_THRESHOLD 95 #define DEFAULT_RECENT_JOB 300 @@ -128,6 +128,7 @@ static uint64_t debug_flag = 0; static uint32_t decrease_rate = DEFAULT_DECREASE_RATE; static uint32_t increase_rate = DEFAULT_INCREASE_RATE; static uint32_t lower_threshold = DEFAULT_LOWER_THRESHOLD; +static uint32_t recent_job = DEFAULT_RECENT_JOB; static uint32_t upper_threshold = DEFAULT_UPPER_THRESHOLD; static bool stop_power = false; static pthread_t power_thread = 0; @@ -217,6 +218,15 @@ static void _load_config(void) } } + if ((tmp_ptr = strstr(sched_params, "recent_job="))) { + recent_job = atoi(tmp_ptr + 11); + if (recent_job < 1) { + error("PowerParameters: recent_job=%u invalid", + recent_job); + recent_job = DEFAULT_RECENT_JOB; + } + } + if ((tmp_ptr = strstr(sched_params, "upper_threshold="))) { upper_threshold = atoi(tmp_ptr + 16); if (upper_threshold < 1) { @@ -230,9 +240,10 @@ static void _load_config(void) if (debug_flag & DEBUG_FLAG_POWER) { info("PowerParameters=balance_interval=%d,capmc_path=%s," "cap_watts=%u,decrease_rate=%u,increase_rate=%u," - "lower_threashold=%u,upper_threshold=%u", + "lower_threashold=%u,recent_job=%u,upper_threshold=%u", balance_interval, capmc_path, cap_watts, decrease_rate, - increase_rate,lower_threshold, upper_threshold); + increase_rate, lower_threshold, recent_job, + upper_threshold); } } @@ -352,6 +363,24 @@ static void _json_parse_object(json_object *jobj, power_config_nodes_t *ent) } } +static void _my_sleep(int add_secs) +{ + struct timespec ts = {0, 0}; + struct timeval tv = {0, 0}; + + if (gettimeofday(&tv, NULL)) { /* Some error */ + sleep(1); + return; + } + + ts.tv_sec = tv.tv_sec + add_secs; + ts.tv_nsec = tv.tv_usec * 1000; + pthread_mutex_lock(&term_lock); + if (!stop_power) + pthread_cond_timedwait(&term_cond, &term_lock, &ts); + pthread_mutex_unlock(&term_lock); +} + /* Periodically attempt to re-balance power caps across nodes */ extern void *_power_agent(void *args) { @@ -367,7 +396,7 @@ extern void *_power_agent(void *args) last_balance_time = time(NULL); while (!stop_power) { - sleep(1); + _my_sleep(1); if (stop_power) break; @@ -412,7 +441,7 @@ static List _rebalance_node_power(void) struct node_record *node_ptr, *node_ptr2; uint32_t alloc_power = 0, avail_power, ave_power, new_cap, tmp_u32; int node_power_raise_cnt = 0; - time_t recent = time(NULL) - DEFAULT_RECENT_JOB; + time_t recent = time(NULL) - recent_job; int i, j; /* Lower caps on under used nodes */ @@ -656,6 +685,22 @@ extern void fini(void) pthread_mutex_unlock(&thread_flag_mutex); } +/* Read the configuration file */ +extern void power_p_reconfig(void) +{ + pthread_mutex_lock(&thread_flag_mutex); + _load_config(); + if (cap_watts == 0) + _stop_power_agent(); + pthread_mutex_unlock(&thread_flag_mutex); +} + +/* Note that a suspended job has been resumed */ +extern void power_p_job_resume(struct job_record *job_ptr) +{ + set_node_new_job(job_ptr, node_record_table_ptr); +} + /* Note that a job has been allocated resources and is ready to start */ 
extern void power_p_job_start(struct job_record *job_ptr) { diff --git a/src/plugins/power/none/power_none.c b/src/plugins/power/none/power_none.c index f39a2653c80..933bc927780 100644 --- a/src/plugins/power/none/power_none.c +++ b/src/plugins/power/none/power_none.c @@ -99,6 +99,18 @@ extern int fini(void) return SLURM_SUCCESS; } +/* Read the configuration file */ +extern void power_p_reconfig(void) +{ + return; +} + +/* Note that a suspended job has been resumed */ +extern void power_p_job_resume(struct job_record *job_ptr) +{ + return; +} + /* Note that a job has been allocated resources and is ready to start */ extern void power_p_job_start(struct job_record *job_ptr) { diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index a14002fcb53..311aff7c2d2 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -71,6 +71,7 @@ #include "src/common/hostlist.h" #include "src/common/node_select.h" #include "src/common/parse_time.h" +#include "src/common/power.h" #include "src/common/slurm_accounting_storage.h" #include "src/common/slurm_jobcomp.h" #include "src/common/slurm_priority.h" @@ -12434,6 +12435,7 @@ static int _job_suspend(struct job_record *job_ptr, uint16_t op, bool indf_susp) if (!IS_JOB_SUSPENDED(job_ptr)) return ESLURM_JOB_NOT_SUSPENDED; rc = _resume_job_nodes(job_ptr, indf_susp); + power_g_job_resume(job_ptr); if (rc != SLURM_SUCCESS) return rc; _suspend_job(job_ptr, op, indf_susp); diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 74d86eded27..911d0f8031e 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -67,14 +67,15 @@ #include "src/common/macros.h" #include "src/common/node_select.h" #include "src/common/parse_spec.h" +#include "src/common/power.h" #include "src/common/read_config.h" #include "src/common/slurm_jobcomp.h" #include "src/common/slurm_topology.h" #include "src/common/slurm_rlimits_info.h" #include "src/common/slurm_route.h" +#include "src/common/strnatcmp.h" #include "src/common/switch.h" #include "src/common/xstring.h" -#include "src/common/strnatcmp.h" #include "src/slurmctld/acct_policy.h" #include "src/slurmctld/burst_buffer.h" @@ -1006,6 +1007,7 @@ int read_slurm_conf(int recover, bool reconfig) rehash_node(); slurm_topo_build_config(); route_g_reconfigure(); + power_g_reconfig(); cpu_freq_reconfig(); rehash_jobs(); -- GitLab
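To make the cap-adjustment arithmetic described in the power_mgmt.shtml example above concrete, the sketch below walks through the same numbers in standalone C. It is illustrative only and is not part of the patch: the constants mirror the example's 100-200 watt nodes and its PowerParameters values, the helper name adjust_node_cap is hypothetical, and the real per-node logic lives in _rebalance_node_power() in power_cray.c and may differ in detail.

#include <stdio.h>
#include <stdint.h>

#define NODE_MIN_WATTS   100	/* example node's minimum power draw        */
#define NODE_MAX_WATTS   200	/* example node's maximum power draw        */
#define LOWER_THRESHOLD   90	/* percent of the cap, from the example     */
#define UPPER_THRESHOLD   98	/* percent of the cap, from the example     */
#define DECREASE_RATE     30	/* percent of (max - min), from the example */
#define INCREASE_RATE     10	/* percent of (max - min), from the example */

/* Return a new power cap for one node given its current cap and usage. */
static uint32_t adjust_node_cap(uint32_t cap, uint32_t usage)
{
	uint32_t range = NODE_MAX_WATTS - NODE_MIN_WATTS;

	if (usage < (cap * LOWER_THRESHOLD) / 100) {
		/* Under-used node: release half of the unused power, but
		 * never more than decrease_rate percent of the node's
		 * min-to-max power range. */
		uint32_t drop = (cap - usage) / 2;
		uint32_t limit = (range * DECREASE_RATE) / 100;
		if (drop > limit)
			drop = limit;
		return cap - drop;
	}
	if (usage > (cap * UPPER_THRESHOLD) / 100) {
		/* Node pressing against its cap: raise the cap by
		 * increase_rate percent of the min-to-max range, bounded
		 * by the node's maximum possible consumption. */
		uint32_t new_cap = cap + (range * INCREASE_RATE) / 100;
		return (new_cap > NODE_MAX_WATTS) ? NODE_MAX_WATTS : new_cap;
	}
	return cap;	/* within both thresholds: leave the cap unchanged */
}

int main(void)
{
	/* "State in 60 Seconds": a node drawing 110 W under a 180 W cap is
	 * lowered to 150 W (half the slack would be 35 W, but the 30 W
	 * decrease_rate limit wins). */
	printf("new cap = %u watts\n", adjust_node_cap(180, 110));

	/* "State in 180 Seconds": a node drawing 128 W under a 130 W cap
	 * exceeds upper_threshold (127 W), so its cap rises to 140 W. */
	printf("new cap = %u watts\n", adjust_node_cap(130, 128));
	return 0;
}

Run as-is, the sketch prints 150 and 140 watts, matching the "State in 60 Seconds" and "State in 180 Seconds" steps of the example; the watts freed from under-used nodes are then spread across the remaining nodes, as the example's later steps show.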