diff --git a/NEWS b/NEWS index 3b9ddb4962b4d5715f9e396b34717015e9ccc6ee..415e8796285fa01b0cb3937fea5b58805c061cb7 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,8 @@ documents those changes that are of interest to users and admins. 16 psets defined in dynamic layout mode. -- Improve srun_cr handling of child srun forking. Patch from Hongjia Cao, NUDT. + -- Configuration parameter ResumeDelay replaced by SuspendTimeout and + ResumeTimeout. * Changes in SLURM 2.0.0-rc2 ============================ diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in index ae3c751b719d7b7ba81d48ab15c0572b483ca670..1c7c823e4a277f90cebcc5ab2f79b451a699a522 100644 --- a/doc/html/configurator.html.in +++ b/doc/html/configurator.html.in @@ -269,7 +269,8 @@ function displayfile() "# POWER SAVE SUPPORT FOR IDLE NODES (optional) <br>" + "#SuspendProgram= <br>" + "#ResumeProgram= <br>" + - "#ResumeDelay= <br>" + + "#SuspendTimeout= <br>" + + "#ResumeTimeout= <br>" + "#ResumeRate= <br>" + "#SuspendExcNodes= <br>" + "#SuspendExcParts= <br>" + diff --git a/doc/html/power_save.shtml b/doc/html/power_save.shtml index d25d84596ca6d087c91ceb77769a109e801eff56..05344d477e977352de5434bfabc2605922cb10c8 100644 --- a/doc/html/power_save.shtml +++ b/doc/html/power_save.shtml @@ -38,14 +38,6 @@ A value of zero results in no limits being imposed. The default value is 60. Use this to prevent rapid drops in power requirements.</li> -<li><b>ResumeDelay</b>: -Minimum delay between when a node is suspended before attempting -to resume it (e.g. power it back up). -This value should exceed the time required for <b>SuspendProgram</b> -to fully suspend a node. -Its value should be no larger than <b>SlurmdTimeout</b>. -The default value is 30 seconds. - <li><b>ResumeRate</b>: Maximum number of nodes to be removed from power saving mode per minute. 
@@ -79,6 +71,20 @@ For reasons of reliability, <b>ResumeProgram</b> may execute more than once for a node when the <b>slurmctld</b> daemon crashes and is restarted.</li> +<li><b>SuspendTimeout</b>: +Maximum time permitted (in seconds) between when a node suspend request +is issued and when the node shutdown is complete. +At that time the node must be ready for a resume request to be issued +as needed for new workload. +The default value is 30 seconds.</li> + +<li><b>ResumeTimeout</b>: +Maximum time permitted (in seconds) between when a node resume request +is issued and when the node is actually available for use. +Nodes which fail to respond in this time frame may be marked DOWN and +the jobs scheduled on the node requeued. +The default value is 60 seconds.</li> + <li><b>SuspendExcNodes</b>: List of nodes to never place in power saving mode. Use SLURM's hostlist expression format. @@ -160,7 +166,8 @@ impact of power saving mode before enabling it.</p> <h2>Fault tolerance</h2> <p>If the <i>slurmctld</i> daemon is terminated gracefully, it will -wait up to <b>ResumeDelay</b> for any spawned <b>SuspendProgram</b> or +wait up to <b>SuspendTimeout</b> or <b>ResumeTimeout</b> (whichever +is larger) for any spawned <b>SuspendProgram</b> or <b>ResumeProgram</b> to terminate before the daemon terminates. 
If the spawned program does not terminate within that time period, the event will be logged and <i>slurmctld</i> will exit in order to @@ -181,6 +188,6 @@ In order to minimize this risk, when the <i>slurmctld</i> daemon is started and node which should be allocated to a job fails to respond, the <b>ResumeProgram</b> will be executed (possibly for a second time).</p> -<p style="text-align:center;">Last modified 14 May 2009</p> +<p style="text-align:center;">Last modified 18 May 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index d4243091f33fa2958f5138602a777305127cc33b..78ddbb16e462c74288c5739519d43867d2c2b55c 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -988,20 +988,6 @@ appearing in this list. The user can override this by specifying which resource limits to propagate with the srun commands "\-\-propagate" option. See \fBPropagateResourceLimits\fR above for a list of valid limit names. -.TP -\fBResumeDelay\fR -Minimum time (in second) between when a node is suspended before attempting -to resume it (e.g. power it back up). -This value should exceed the time required for \fBSuspendProgram\fR to fully -suspend a node. -Its value should be no larger than \fBSlurmdTimeout\fR. -The default value is 30 seconds. -Related configuration options include \fBResumeProgram\fR, \fBResumeRate\fR, -\fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendProgram\fR, \fBSuspendExcNodes\fR, -and \fBSuspendExcParts\fR. -More information is available at the SLURM web site -(https://computing.llnl.gov/linux/slurm/power_save.html). - .TP \fBResumeProgram\fR SLURM supports a mechanism to reduce power consumption on nodes that @@ -1019,9 +1005,9 @@ The argument to the program will be the names of nodes to be removed from power savings mode (using SLURM's hostlist expression format). By default no program is run. 
-Related configuration options include \fBResumeDelay\fR, \fBResumeRate\fR, -\fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendProgram\fR, \fBSuspendExcNodes\fR, -and \fBSuspendExcParts\fR. +Related configuration options include \fBResumeTimeout\fR, \fBResumeRate\fR, +\fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendTimeout\fR, \fBSuspendProgram\fR, +\fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR. More information is available at the SLURM web site (https://computing.llnl.gov/linux/slurm/power_save.html). @@ -1034,9 +1020,22 @@ power surges if a large number of nodes in power save mode are assigned work at the same time (e.g. a large job starts). A value of zero results in no limits being imposed. The default value is 300 nodes per minute. -Related configuration options include \fBResumeDelay\fR, \fBResumeProgram\fR, -\fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendProgram\fR, \fBSuspendExcNodes\fR, -and \fBSuspendExcParts\fR. +Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, +\fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendTimeout\fR, \fBSuspendProgram\fR, +\fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR. + +.TP +\fBResumeTimeout\fR +Maximum time permitted (in seconds) between when a node resume request +is issued and when the node is actually available for use. +Nodes which fail to respond in this time frame may be marked DOWN and +the jobs scheduled on the node requeued. +The default value is 60 seconds. +Related configuration options include \fBResumeProgram\fR, \fBResumeRate\fR, +\fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendTimeout\fR, \fBSuspendProgram\fR, +\fBSuspendExcNodes\fR and \fBSuspendExcParts\fR. +More information is available at the SLURM web site +(https://computing.llnl.gov/linux/slurm/power_save.html). .TP \fBResvOverRun\fR @@ -1405,9 +1404,9 @@ Specifies the nodes which are to not be placed in power save mode, even if the node remains idle for an extended period of time. 
Use SLURM's hostlist expression to identify nodes. By default no nodes are excluded. -Related configuration options include \fBResumeDelay\fR, \fBResumeProgram\fR, -\fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendRate\fR, \fBSuspendTime\fR -and \fBSuspendExcParts\fR. +Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, +\fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendRate\fR, \fBSuspendTime\fR, +\fBSuspendTimeout\fR, and \fBSuspendExcParts\fR. .TP \fBSuspendExcParts\fR @@ -1415,9 +1414,9 @@ Specifies the partitions whose nodes are to not be placed in power save mode, even if the node remains idle for an extended period of time. Multiple partitions can be identified and separated by commas. By default no nodes are excluded. -Related configuration options include \fBResumeDelay\fR, \fBResumeProgram\fR, +Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, \fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendRate\fR, \fBSuspendTime\fR -and \fBSuspendExcNodes\fR. +\fBSuspendTimeout\fR, and \fBSuspendExcNodes\fR. .TP \fBSuspendProgram\fR @@ -1431,9 +1430,9 @@ The argument to the program will be the names of nodes to be placed into power savings mode (using SLURM's hostlist expression format). By default, no program is run. -Related configuration options include \fBResumeDelay\fR, \fBResumeProgram\fR, -\fBResumeRate\fR, \fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendExcNodes\fR, -and \fBSuspendExcParts\fR. +Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, +\fBResumeRate\fR, \fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendTimeout\fR, +\fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR. .TP \fBSuspendRate\fR @@ -1442,20 +1441,31 @@ The value is number of nodes per minute and it can be used to prevent a large drop in power power consumption (e.g. after a large job completes). A value of zero results in no limits being imposed. The default value is 60 nodes per minute. 
-Related configuration options include \fBResumeDelay\fR, \fBResumeProgram\fR, -\fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendTime\fR, \fBSuspendExcNodes\fR, -and \fBSuspendExcParts\fR. +Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, +\fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendTime\fR, \fBSuspendTimeout\fR, +\fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR. .TP \fBSuspendTime\fR Nodes which remain idle for this number of seconds will be placed into power save mode by \fBSuspendProgram\fR. -The configured value should exceed the time to suspend and resume a node. -Also note the \fBResumeDelay\fR configuration parameter. A value of \-1 disables power save mode and is the default. -Related configuration options include \fBResumeDelay\fR, \fBResumeProgram\fR, -\fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendRate\fR, \fBSuspendExcNodes\fR, -and \fBSuspendExcParts\fR. +Related configuration options include \fBResumeTimeout\fR, \fBResumeProgram\fR, +\fBResumeRate\fR, \fBSuspendProgram\fR, \fBSuspendRate\fR, \fBSuspendTimeout\fR, +\fBSuspendExcNodes\fR, and \fBSuspendExcParts\fR. + +.TP +\fBSuspendTimeout\fR +Maximum time permitted (in seconds) between when a node suspend request +is issued and when the node shutdown is complete. +At that time the node must be ready for a resume request to be issued +as needed for new work. +The default value is 30 seconds. +Related configuration options include \fBResumeProgram\fR, \fBResumeRate\fR, +\fBResumeTimeout\fR, \fBSuspendRate\fR, \fBSuspendTime\fR, \fBSuspendProgram\fR, +\fBSuspendExcNodes\fR and \fBSuspendExcParts\fR. +More information is available at the SLURM web site +(https://computing.llnl.gov/linux/slurm/power_save.html). 
.TP \fBSwitchType\fR diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 14ebd9504d0a6e4cdb2c7a7ea3dd79614655ead5..95511f00f13891d7ba9f209dbbef00ebf6f07d39 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1162,10 +1162,10 @@ typedef struct slurm_ctl_conf { * be propagated */ char *propagate_rlimits;/* Propagate (all/specific) resource limits */ char *propagate_rlimits_except;/* Propagate all rlimits except these */ - uint16_t resume_delay; /* minimum time from suspend before attempting - * to resume a node */ char *resume_program; /* program to make nodes full power */ uint16_t resume_rate; /* nodes to make full power, per minute */ + uint16_t resume_timeout;/* time required in order to perform a node + * resume operation */ uint16_t resv_over_run; /* how long a running job can exceed * reservation time */ uint16_t ret2service; /* 1 return DOWN node to service at @@ -1211,6 +1211,8 @@ typedef struct slurm_ctl_conf { char *suspend_program; /* program to make nodes power saving */ uint16_t suspend_rate; /* nodes to make power saving, per minute */ uint16_t suspend_time; /* node idle for this long before power save mode */ + uint16_t suspend_timeout;/* time required in order to perform a node + * suspend operation */ char *switch_type; /* switch or interconnect type */ char *task_epilog; /* pathname of task launch epilog */ char *task_plugin; /* task launch plugin */ @@ -1227,6 +1229,9 @@ typedef struct slurm_ctl_conf { * they are considered "unkillable". 
*/ uint16_t use_pam; /* enable/disable PAM support */ uint16_t wait_time; /* default job --wait time */ + uint16_t z_16; /* reserved for future use */ + uint32_t z_32; /* reserved for future use */ + char *z_char; /* reserved for future use */ } slurm_ctl_conf_t; typedef struct slurmd_status_msg { diff --git a/src/api/config_info.c b/src/api/config_info.c index a8b12e5f93217c79d0a67d9aa4ff43ab719b24bb..6cc7991cc7be3b528942ba39264a69651e6b9ac2 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -344,10 +344,10 @@ void slurm_print_ctl_conf ( FILE* out, slurm_ctl_conf_ptr->propagate_rlimits_except); fprintf(out, "ResumeProgram = %s\n", slurm_ctl_conf_ptr->resume_program); - fprintf(out, "ResumeDelay = %u sec\n", - slurm_ctl_conf_ptr->resume_delay); fprintf(out, "ResumeRate = %u nodes/min\n", slurm_ctl_conf_ptr->resume_rate); + fprintf(out, "ResumeTimeout = %u sec\n", + slurm_ctl_conf_ptr->resume_timeout); if (slurm_ctl_conf_ptr->resv_over_run == (uint16_t) INFINITE) fprintf(out, "ResvOverRun = UNLIMITED\n"); else { @@ -431,6 +431,8 @@ void slurm_print_ctl_conf ( FILE* out, slurm_ctl_conf_ptr->suspend_rate); fprintf(out, "SuspendTime = %d\n", ((int)slurm_ctl_conf_ptr->suspend_time - 1)); + fprintf(out, "SuspendTimeout = %u sec\n", + slurm_ctl_conf_ptr->suspend_timeout); fprintf(out, "SwitchType = %s\n", slurm_ctl_conf_ptr->switch_type); fprintf(out, "TaskEpilog = %s\n", diff --git a/src/common/read_config.c b/src/common/read_config.c index c29a5e26615058fa9c4f9b212e7cf771d4ce84e0..fc00a98264309235581528b63a54303d493104ce 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -217,9 +217,9 @@ s_p_options_t slurm_conf_options[] = { {"PropagatePrioProcess", S_P_UINT16}, {"PropagateResourceLimitsExcept", S_P_STRING}, {"PropagateResourceLimits", S_P_STRING}, - {"ResumeDelay", S_P_UINT16}, {"ResumeProgram", S_P_STRING}, {"ResumeRate", S_P_UINT16}, + {"ResumeTimeout", S_P_UINT16}, {"ResvOverRun", S_P_UINT16}, {"ReturnToService", S_P_UINT16}, 
{"SallocDefaultCommand", S_P_STRING}, @@ -253,6 +253,7 @@ s_p_options_t slurm_conf_options[] = { {"SuspendProgram", S_P_STRING}, {"SuspendRate", S_P_UINT16}, {"SuspendTime", S_P_LONG}, + {"SuspendTimeout", S_P_UINT16}, {"SwitchType", S_P_STRING}, {"TaskEpilog", S_P_STRING}, {"TaskProlog", S_P_STRING}, @@ -1427,7 +1428,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->propagate_prio_process = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->propagate_rlimits); xfree (ctl_conf_ptr->propagate_rlimits_except); - ctl_conf_ptr->resume_delay = (uint16_t) NO_VAL; + ctl_conf_ptr->resume_timeout = 0; xfree (ctl_conf_ptr->resume_program); ctl_conf_ptr->resume_rate = (uint16_t) NO_VAL; ctl_conf_ptr->resv_over_run = 0; @@ -1464,6 +1465,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->suspend_program); ctl_conf_ptr->suspend_rate = (uint16_t) NO_VAL; ctl_conf_ptr->suspend_time = (uint16_t) NO_VAL; + ctl_conf_ptr->suspend_timeout = 0; xfree (ctl_conf_ptr->switch_type); xfree (ctl_conf_ptr->task_epilog); xfree (ctl_conf_ptr->task_plugin); @@ -2195,11 +2197,11 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) s_p_get_uint16(&conf->resv_over_run, "ResvOverRun", hashtbl); - if (!s_p_get_uint16(&conf->resume_delay, "ResumeDelay", hashtbl)) - conf->resume_delay = DEFAULT_RESUME_DELAY; s_p_get_string(&conf->resume_program, "ResumeProgram", hashtbl); if (!s_p_get_uint16(&conf->resume_rate, "ResumeRate", hashtbl)) conf->resume_rate = DEFAULT_RESUME_RATE; + if (!s_p_get_uint16(&conf->resume_timeout, "ResumeTimeout", hashtbl)) + conf->resume_timeout = DEFAULT_RESUME_TIMEOUT; s_p_get_string(&conf->salloc_default_command, "SallocDefaultCommand", hashtbl); @@ -2341,6 +2343,8 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) conf->suspend_time = long_suspend_time + 1; else conf->suspend_time = 0; + if (!s_p_get_uint16(&conf->suspend_timeout, "SuspendTimeout", hashtbl)) + conf->suspend_timeout = 
DEFAULT_SUSPEND_TIMEOUT; /* see above for switch_type, order dependent */ diff --git a/src/common/read_config.h b/src/common/read_config.h index 240293739464ed27eaecef2857dba3862db6700f..f5f740a70a6702832f380fb766f7ec1a231c031a 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -98,8 +98,8 @@ extern char *default_plugstack; #define DEFAULT_PRIORITY_TYPE "priority/basic" #define DEFAULT_PROPAGATE_PRIO_PROCESS 0 #define DEFAULT_RETURN_TO_SERVICE 0 -#define DEFAULT_RESUME_DELAY 30 #define DEFAULT_RESUME_RATE 300 +#define DEFAULT_RESUME_TIMEOUT 60 #define DEFAULT_SAVE_STATE_LOC "/tmp" #define DEFAULT_SCHEDROOTFILTER 1 #define DEFAULT_SCHEDULER_PORT 7321 @@ -121,6 +121,7 @@ extern char *default_plugstack; #define DEFAULT_STORAGE_PORT 0 #define DEFAULT_SUSPEND_RATE 60 #define DEFAULT_SUSPEND_TIME 0 +#define DEFAULT_SUSPEND_TIMEOUT 30 #define DEFAULT_SWITCH_TYPE "switch/none" #define DEFAULT_TASK_PLUGIN "task/none" #define DEFAULT_TMP_FS "/tmp" diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 37c687465921c2dec78d8f5fada1ee8e170b7c28..d2aaea27e97c320d9f4dfad91fee14aeace78878 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1242,6 +1242,7 @@ void slurm_free_ctl_conf(slurm_ctl_conf_info_msg_t * config_ptr) xfree(config_ptr->tmp_fs); xfree(config_ptr->topology_plugin); xfree(config_ptr->unkillable_program); + xfree(config_ptr->z_char); xfree(config_ptr); } } diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 5496089ac302258b5e3ec0cddcef806045640843..cb240584c1e58ac0f6d640dda29fe5518587804c 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2918,9 +2918,9 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) packstr(build_ptr->propagate_rlimits, buffer); packstr(build_ptr->propagate_rlimits_except, buffer); - pack16(build_ptr->resume_delay, buffer); 
packstr(build_ptr->resume_program, buffer); pack16(build_ptr->resume_rate, buffer); + pack16(build_ptr->resume_timeout, buffer); pack16(build_ptr->resv_over_run, buffer); pack16(build_ptr->ret2service, buffer); @@ -2979,6 +2979,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) packstr(build_ptr->suspend_program, buffer); pack16(build_ptr->suspend_rate, buffer); pack16(build_ptr->suspend_time, buffer); + pack16(build_ptr->suspend_timeout, buffer); packstr(build_ptr->switch_type, buffer); packstr(build_ptr->task_epilog, buffer); @@ -2995,6 +2996,10 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) pack16(build_ptr->unkillable_timeout, buffer); pack16(build_ptr->wait_time, buffer); + + pack16(build_ptr->z_16, buffer); + pack32(build_ptr->z_32, buffer); + packstr(build_ptr->z_char, buffer); } static int @@ -3133,10 +3138,10 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpackstr_xmalloc(&build_ptr->propagate_rlimits_except, &uint32_tmp, buffer); - safe_unpack16(&build_ptr->resume_delay, buffer); safe_unpackstr_xmalloc(&build_ptr->resume_program, &uint32_tmp, buffer); safe_unpack16(&build_ptr->resume_rate, buffer); + safe_unpack16(&build_ptr->resume_timeout, buffer); safe_unpack16(&build_ptr->resv_over_run, buffer); safe_unpack16(&build_ptr->ret2service, buffer); @@ -3207,6 +3212,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** &uint32_tmp, buffer); safe_unpack16(&build_ptr->suspend_rate, buffer); safe_unpack16(&build_ptr->suspend_time, buffer); + safe_unpack16(&build_ptr->suspend_timeout, buffer); safe_unpackstr_xmalloc(&build_ptr->switch_type, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->task_epilog, &uint32_tmp, buffer); @@ -3225,6 +3231,10 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpack16(&build_ptr->wait_time, buffer); + safe_unpack16(&build_ptr->z_16, buffer); + safe_unpack32(&build_ptr->z_32, buffer); + 
safe_unpackstr_xmalloc(&build_ptr->z_char, &uint32_tmp, buffer); + return SLURM_SUCCESS; unpack_error: diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c index 53627dec6e1321f230a07a6be238eb2f82f538e1..d4447b1211e3f87b0a11d45ca91e8a179582a6c5 100644 --- a/src/slurmctld/power_save.c +++ b/src/slurmctld/power_save.c @@ -64,13 +64,12 @@ #define MAX_SHUTDOWN_DELAY 120 /* seconds to wait for child procs * to exit after daemon shutdown * request, then orphan or kill proc */ -#define PROG_WARNING_TIME 60 /* log program run time if over this */ /* Records for tracking processes forked to suspend/resume nodes */ pid_t child_pid[PID_CNT]; /* pid of process */ time_t child_time[PID_CNT]; /* start time of process */ -int idle_time, suspend_rate, resume_delay, resume_rate; +int idle_time, suspend_rate, resume_timeout, resume_rate, suspend_timeout; char *suspend_prog = NULL, *resume_prog = NULL; char *exc_nodes = NULL, *exc_parts = NULL; time_t last_config = (time_t) 0, last_suspend = (time_t) 0; @@ -113,7 +112,7 @@ static void _do_power_work(void) resume_cnt *= rate; } - if (now > (last_suspend + resume_delay)) { + if (now > (last_suspend + suspend_timeout)) { /* ready to start another round of node suspends */ run_suspend = true; if (last_suspend) { @@ -148,7 +147,7 @@ static void _do_power_work(void) node_ptr->node_state &= (~NODE_STATE_POWER_SAVE); bit_clear(power_node_bitmap, i); node_ptr->node_state |= NODE_STATE_NO_RESPOND; - node_ptr->last_response = now + resume_delay; + node_ptr->last_response = now + resume_timeout; bit_set(wake_node_bitmap, i); } if (run_suspend && @@ -172,7 +171,7 @@ static void _do_power_work(void) last_suspend = now; } } - if ((now - last_log) > 600) { + if (((now - last_log) > 600) && (susp_total > 0)) { info("Power save mode: %d nodes", susp_total); last_log = now; } @@ -311,8 +310,9 @@ static pid_t _run_prog(char *prog, char *arg) * return the count of empty slots in the child_pid array */ static int _reap_procs(void) { - int 
empties = 0, delay, i, rc, status; + int empties = 0, delay, i, max_timeout, rc, status; + max_timeout = MAX(suspend_timeout, resume_timeout); for (i=0; i<PID_CNT; i++) { if (child_pid[i] == 0) { empties++; @@ -323,9 +323,9 @@ static int _reap_procs(void) continue; delay = difftime(time(NULL), child_time[i]); - if (delay > PROG_WARNING_TIME) { - debug("power_save: program %d ran for %d sec", - (int) child_pid[i], delay); + if (delay > max_timeout) { + info("power_save: program %d ran for %d sec", + (int) child_pid[i], delay); } rc = WEXITSTATUS(status); @@ -370,14 +370,15 @@ static int _kill_procs(void) static void _shutdown_power(void) { - int i, proc_cnt; + int i, proc_cnt, max_timeout; + max_timeout = MAX(suspend_timeout, resume_timeout); /* Try to avoid orphan processes */ for (i=0; ; i++) { proc_cnt = PID_CNT - _reap_procs(); if (proc_cnt == 0) /* all procs completed */ break; - if (i >= resume_delay) { + if (i >= max_timeout) { error("power_save: orphaning %d processes which are " "not terminating so slurmctld can exit", proc_cnt); @@ -411,12 +412,13 @@ static int _init_power_config(void) { slurm_ctl_conf_t *conf = slurm_conf_lock(); - last_config = slurmctld_conf.last_update; - idle_time = conf->suspend_time - 1; - suspend_rate = conf->suspend_rate; - resume_delay = conf->resume_delay; - resume_rate = conf->resume_rate; - slurmd_timeout = conf->slurmd_timeout; + last_config = slurmctld_conf.last_update; + idle_time = conf->suspend_time - 1; + suspend_rate = conf->suspend_rate; + resume_timeout = conf->resume_timeout; + resume_rate = conf->resume_rate; + slurmd_timeout = conf->slurmd_timeout; + suspend_timeout = conf->suspend_timeout; _clear_power_config(); if (conf->suspend_program) suspend_prog = xstrdup(conf->suspend_program); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index fbddd16a4ce23848e95bb1c63d0a36b99cf9ed6c..a07c8c0fd3b2e39f8c8bd9fe744b521d212daab0 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -495,9 
+495,9 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->propagate_rlimits_except = xstrdup(conf-> propagate_rlimits_except); - conf_ptr->resume_delay = conf->resume_delay; conf_ptr->resume_program = xstrdup(conf->resume_program); conf_ptr->resume_rate = conf->resume_rate; + conf_ptr->resume_timeout = conf->resume_timeout; conf_ptr->resv_over_run = conf->resv_over_run; conf_ptr->ret2service = conf->ret2service; @@ -539,6 +539,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->suspend_program = xstrdup(conf->suspend_program); conf_ptr->suspend_rate = conf->suspend_rate; conf_ptr->suspend_time = conf->suspend_time; + conf_ptr->suspend_timeout = conf->suspend_timeout; conf_ptr->switch_type = xstrdup(conf->switch_type); conf_ptr->task_epilog = xstrdup(conf->task_epilog);