diff --git a/RELEASE_NOTES b/RELEASE_NOTES index bb8224975a06c447b62856e22a0319c1ddb05dfc..655ec5930b86ff7e732a128388c8a3a84b9a7dcc 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -45,6 +45,8 @@ HIGHLIGHTS to have all the limits a QOS has. If a limit is set in both QOS the partition QOS will override the job's QOS unless the job's QOS has the 'PartitionQOS' flag set. + -- Expanded --cpu-freq parameters to include min-max:governor specifications. + --cpu-freq now supported on salloc and sbatch. RPMBUILD CHANGES ================ @@ -67,6 +69,9 @@ CONFIGURATION FILE CHANGES (see man appropriate man page for details) between the user application and srun when the user application terminates. -- Remove the CR_ALLOCATE_FULL_SOCKET configuration option. It is now the default. + -- Added DebugFlags value of "CpuFrequency". + -- Added CpuFreqGovernors which lists governors allowed to be set with + --cpu-freq on salloc, sbatch, and srun. DBD CONFIGURATION FILE CHANGES (see "man slurmdbd.conf" for details) ==================================================================== @@ -74,6 +79,7 @@ DBD CONFIGURATION FILE CHANGES (see "man slurmdbd.conf" for details) COMMAND CHANGES (see man pages for details) =========================================== + -- Added "--cpu-freq" option to salloc and sbatch. -- Add sbcast support for file transfer to resources allocated to a job step rather than a job allocation (e.g. "sbcast -j 123.4 ..."). -- Added new job state of STOPPED indicating processes have been stopped with a @@ -94,14 +100,39 @@ API CHANGES Changed members of the following structs ======================================== + -- Changed the following fields to struct job_descriptor + cpu_freq renamed cpu_freq_max. + -- Changed the following fields to struct job_info + cpu_freq renamed cpu_freq_max. + -- Changed the following fields to struct slurm_step_ctx_params_t + cpu_freq renamed cpu_freq_max. 
+ -- Changed the following fields to struct slurm_step_launch_params_t + cpu_freq renamed cpu_freq_max. + -- Changed the following fields to struct job_step_info_t + cpu_freq renamed cpu_freq_max. + -- Changed the following fields to struct resource_allocation_response_msg_t + cpu_freq renamed cpu_freq_max. Added the following struct definitions ====================================== + -- Added the following fields to struct job_descriptor + cpu_freq_min, cpu_freq_gov. + -- Added the following fields to struct job_info + cpu_freq_min, cpu_freq_gov. + -- Added the following fields to struct slurm_step_ctx_params_t + cpu_freq_min, cpu_freq_gov. + -- Added the following fields to struct slurm_step_launch_params_t + cpu_freq_min, cpu_freq_gov. + -- Added the following fields to struct job_step_info_t + cpu_freq_min, cpu_freq_gov. + -- Added the following fields to struct resource_allocation_response_msg_t + cpu_freq_min, cpu_freq_gov. Added the following struct definitions ====================================== +CPU_FREQ_GOV_MASK -- mask for all defined cpu-frequency governors. 
Changed the following enums and #defines @@ -129,6 +160,5 @@ Added the following struct definitions Added the following enums and #defines ======================================== - Added the following API's ========================= diff --git a/contribs/perlapi/libslurm/perl/alloc.c b/contribs/perlapi/libslurm/perl/alloc.c index f1fa3a2c2fc6cacb1f7a26543576daadbc6c5260..1d3c0dd216f19313e8959b8f2695adbf5050184b 100644 --- a/contribs/perlapi/libslurm/perl/alloc.c +++ b/contribs/perlapi/libslurm/perl/alloc.c @@ -151,6 +151,9 @@ hv_to_job_desc_msg(HV *hv, job_desc_msg_t *job_desc) FETCH_FIELD(hv, job_desc, warn_time, uint16_t, FALSE); FETCH_FIELD(hv, job_desc, work_dir, charp, FALSE); /* job constraints: */ + FETCH_FIELD(hv, job_desc, cpu_freq_min, uint32_t, FALSE); + FETCH_FIELD(hv, job_desc, cpu_freq_max, uint32_t, FALSE); + FETCH_FIELD(hv, job_desc, cpu_freq_gov, uint32_t, FALSE); FETCH_FIELD(hv, job_desc, cpus_per_task, uint16_t, FALSE); FETCH_FIELD(hv, job_desc, min_cpus, uint32_t, FALSE); FETCH_FIELD(hv, job_desc, max_cpus, uint32_t, FALSE); diff --git a/contribs/perlapi/libslurm/perl/step_ctx.c b/contribs/perlapi/libslurm/perl/step_ctx.c index ca96e9a2bb114a07d863434fdc62257e66f6e59e..5966c0ee39175393cd197c9d145e0df7e44c652e 100644 --- a/contribs/perlapi/libslurm/perl/step_ctx.c +++ b/contribs/perlapi/libslurm/perl/step_ctx.c @@ -20,7 +20,9 @@ hv_to_slurm_step_ctx_params(HV *hv, slurm_step_ctx_params_t *params) FETCH_FIELD(hv, params, ckpt_interval, uint16_t, FALSE); FETCH_FIELD(hv, params, cpu_count, uint32_t, FALSE); - FETCH_FIELD(hv, params, cpu_freq, uint32_t, FALSE); + FETCH_FIELD(hv, params, cpu_freq_min, uint32_t, FALSE); + FETCH_FIELD(hv, params, cpu_freq_max, uint32_t, FALSE); + FETCH_FIELD(hv, params, cpu_freq_gov, uint32_t, FALSE); FETCH_FIELD(hv, params, exclusive, uint16_t, FALSE); FETCH_FIELD(hv, params, features, charp, FALSE); FETCH_FIELD(hv, params, immediate, uint16_t, FALSE); @@ -190,7 +192,9 @@ hv_to_slurm_step_launch_params(HV *hv, 
slurm_step_launch_params_t *params) FETCH_FIELD(hv, params, task_epilog, charp, FALSE); FETCH_FIELD(hv, params, cpu_bind_type, uint16_t, FALSE); FETCH_FIELD(hv, params, cpu_bind, charp, FALSE); - FETCH_FIELD(hv, params, cpu_freq, uint32_t, FALSE); + FETCH_FIELD(hv, params, cpu_freq_min, uint32_t, FALSE); + FETCH_FIELD(hv, params, cpu_freq_max, uint32_t, FALSE); + FETCH_FIELD(hv, params, cpu_freq_gov, uint32_t, FALSE); FETCH_FIELD(hv, params, mem_bind_type, uint16_t, FALSE); FETCH_FIELD(hv, params, mem_bind, charp, FALSE); diff --git a/doc/man/man1/sacct.1 b/doc/man/man1/sacct.1 index 6faf19c01a68d650a12c8383c1c076d87a61dfd7..a27ce05b5679f4946cfa45d05617ca1489326e7f 100644 --- a/doc/man/man1/sacct.1 +++ b/doc/man/man1/sacct.1 @@ -131,12 +131,13 @@ MaxPages MaxPagesNode MaxPagesTask MaxRSS MaxRSSNode MaxRSSTask MaxVMSize MaxVMSizeNode MaxVMSizeTask MinCPU MinCPUNode MinCPUTask NCPUS NNodes NodeList NTasks -Priority Partition QOSRAW ReqCPUFreq -ReqCPUs ReqMem Reservation ReservationId -Reserved ResvCPU ResvCPURAW Start -State Submit Suspended SystemCPU -Timelimit TotalCPU UID User -UserCPU WCKey WCKeyID +Priority Partition QOSRAW ReqCPUFreqMin +ReqCPUFreqMax ReqCPUFreqGov ReqCPUs ReqMem +Reservation ReservationId Reserved ResvCPU +ResvCPURAW Start State Submit +Suspended SystemCPU Timelimit TotalCPU +UID User UserCPU WCKey +WCKeyID .ft 1 .fi @@ -214,7 +215,7 @@ avevmsize,maxrss,maxrssnode,maxrsstask,averss,maxpages,maxpagesnode, maxpagestask,avepages,mincpu,mincpunode,mincputask,avecpu,ntasks, alloccpus,elapsed,state,exitcode,maxdiskread,maxdiskreadnode,maxdiskreadtask, avediskread,maxdiskwrite,maxdiskwritenode,maxdiskwritetask,avediskwrite, -allocgres,reqgres +allocgres,reqgres,avecpufreq,reqcpufreqmin,reqcpufreqmax,reqcpufreqgov .ad .TP diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 4811ff8b9a6f1bff411acc0f521e4001d1fbb0c1..78e0d1898049a888e41fba7917beb5633b0d17e0 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -243,6 +243,108 @@ 
Restrict node selection to nodes with at least the specified number of cores per socket. See additional information under \fB\-B\fR option above when task/affinity plugin is enabled. +.TP +\fB\-\-cpu\-freq\fR =<\fIp1\fR[-\fIp2\fR[:\fIp3\fR]]> + +Request that job steps initiated by srun commands inside this allocation +be run at some requested frequency if possible, on the CPUs selected +for the step on the compute node(s). + +\fBp1\fR can be [#### | low | medium | high | highm1] which will set the +frequency scaling_speed to the corresponding value, and set the frequency +scaling_governor to UserSpace. See below for definition of the values. + +\fBp1\fR can be [Conservative | OnDemand | Performance | PowerSave] which +will set the scaling_governor to the corresponding value. The governor has to be +in the list set by the slurm.conf option CpuFreqGovernors. + +When \fBp2\fR is present, p1 will be the minimum scaling frequency and +p2 will be the maximum scaling frequency. + +\fBp2\fR can be [#### | medium | high | highm1]. p2 must be greater than p1. + +\fBp3\fR can be [Conservative | OnDemand | Performance | PowerSave | UserSpace] +which will set the governor to the corresponding value. + +If \fBp3\fR is UserSpace, the frequency scaling_speed will be set by a power +or energy aware scheduling strategy to a value between p1 and p2 that lets the +job run within the site's power goal. The job may be delayed if p1 is higher +than a frequency that allows the job to run within the goal. + +If the current frequency is < min, it will be set to min. Likewise, +if the current frequency is > max, it will be set to max. 
+ + +Acceptable values at present include: +.RS +.TP 14 +\fB####\fR +frequency in kilohertz +.TP +\fBLow\fR +the lowest available frequency +.TP +\fBHigh\fR +the highest available frequency +.TP +\fBHighM1\fR +(high minus one) will select the next highest available frequency +.TP +\fBMedium\fR +attempts to set a frequency in the middle of the available range +.TP +\fBConservative\fR +attempts to use the Conservative CPU governor +.TP +\fBOnDemand\fR +attempts to use the OnDemand CPU governor (the default value) +.TP +\fBPerformance\fR +attempts to use the Performance CPU governor +.TP +\fBPowerSave\fR +attempts to use the PowerSave CPU governor +.TP +\fBUserSpace\fR +attempts to use the UserSpace CPU governor + +.RE + +The following informational environment variable is set in the job +step when the \fB\-\-cpu\-freq\fR option is requested. +.nf + SLURM_CPU_FREQ_REQ +.fi + +This environment variable can also be used to supply the value for the +CPU frequency request if it is set when the 'srun' command is issued. +The \fB\-\-cpu\-freq\fR on the command line will override the +environment variable value. The form of the environment variable is +the same as on the command line. +See the \fBENVIRONMENT VARIABLES\fR +section for a description of the SLURM_CPU_FREQ_REQ variable. + +\fBNOTE\fR: This parameter is treated as a request, not a requirement. +If the job step's node does not support setting the CPU frequency, or +the requested value is outside the bounds of the legal frequencies, an +error is logged, but the job step is allowed to continue. + +\fBNOTE\fR: Setting the frequency for just the CPUs of the job step +implies that the tasks are confined to those CPUs. If task +confinement (i.e., TaskPlugin=task/affinity or +TaskPlugin=task/cgroup with the "ConstrainCores" option) is not +configured, this parameter is ignored. 
+ +\fBNOTE\fR: When the step completes, the frequency and governor of each +selected CPU are reset to the configured \fBCpuFreqDef\fR value with a +default value of the OnDemand CPU governor. + +\fBNOTE\fR: Submitting jobs with the \fB\-\-cpu\-freq\fR option +with linuxproc as the ProctrackType can cause jobs to run too quickly before +accounting is able to poll for job information. As a result, not all of +the accounting information will be present. +.RE + .TP \fB\-c\fR, \fB\-\-cpus\-per\-task\fR=<\fIncpus\fR> Advise the SLURM controller that ensuing job steps will require \fIncpus\fR diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 7d8cb880d84683f3556dcbae7dd41b0af57a7c8f..1b9d1a3111242ed45b44ca99a8fda4a470d13c55 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -272,6 +272,108 @@ Restrict node selection to nodes with at least the specified number of cores per socket. See additional information under \fB\-B\fR option above when task/affinity plugin is enabled. +.TP +\fB\-\-cpu\-freq\fR =<\fIp1\fR[-\fIp2\fR[:\fIp3\fR]]> + +Request that job steps initiated by srun commands inside this sbatch script +be run at some requested frequency if possible, on the CPUs selected +for the step on the compute node(s). + +\fBp1\fR can be [#### | low | medium | high | highm1] which will set the +frequency scaling_speed to the corresponding value, and set the frequency +scaling_governor to UserSpace. See below for definition of the values. + +\fBp1\fR can be [Conservative | OnDemand | Performance | PowerSave] which +will set the scaling_governor to the corresponding value. The governor has to be +in the list set by the slurm.conf option CpuFreqGovernors. + +When \fBp2\fR is present, p1 will be the minimum scaling frequency and +p2 will be the maximum scaling frequency. + +\fBp2\fR can be [#### | medium | high | highm1]. p2 must be greater than p1. 
+ +\fBp3\fR can be [Conservative | OnDemand | Performance | PowerSave | UserSpace] +which will set the governor to the corresponding value. + +If \fBp3\fR is UserSpace, the frequency scaling_speed will be set by a power +or energy aware scheduling strategy to a value between p1 and p2 that lets the +job run within the site's power goal. The job may be delayed if p1 is higher +than a frequency that allows the job to run within the goal. + +If the current frequency is < min, it will be set to min. Likewise, +if the current frequency is > max, it will be set to max. + + +Acceptable values at present include: +.RS +.TP 14 +\fB####\fR +frequency in kilohertz +.TP +\fBLow\fR +the lowest available frequency +.TP +\fBHigh\fR +the highest available frequency +.TP +\fBHighM1\fR +(high minus one) will select the next highest available frequency +.TP +\fBMedium\fR +attempts to set a frequency in the middle of the available range +.TP +\fBConservative\fR +attempts to use the Conservative CPU governor +.TP +\fBOnDemand\fR +attempts to use the OnDemand CPU governor (the default value) +.TP +\fBPerformance\fR +attempts to use the Performance CPU governor +.TP +\fBPowerSave\fR +attempts to use the PowerSave CPU governor +.TP +\fBUserSpace\fR +attempts to use the UserSpace CPU governor + +.RE + +The following informational environment variable is set in the job +step when the \fB\-\-cpu\-freq\fR option is requested. +.nf + SLURM_CPU_FREQ_REQ +.fi + +This environment variable can also be used to supply the value for the +CPU frequency request if it is set when the 'srun' command is issued. +The \fB\-\-cpu\-freq\fR on the command line will override the +environment variable value. The form of the environment variable is +the same as on the command line. +See the \fBENVIRONMENT VARIABLES\fR +section for a description of the SLURM_CPU_FREQ_REQ variable. + +\fBNOTE\fR: This parameter is treated as a request, not a requirement. 
+If the job step's node does not support setting the CPU frequency, or +the requested value is outside the bounds of the legal frequencies, an +error is logged, but the job step is allowed to continue. + +\fBNOTE\fR: Setting the frequency for just the CPUs of the job step +implies that the tasks are confined to those CPUs. If task +confinement (i.e., TaskPlugin=task/affinity or +TaskPlugin=task/cgroup with the "ConstrainCores" option) is not +configured, this parameter is ignored. + +\fBNOTE\fR: When the step completes, the frequency and governor of each +selected CPU are reset to the configured \fBCpuFreqDef\fR value with a +default value of the OnDemand CPU governor. + +\fBNOTE\fR: Submitting jobs with the \fB\-\-cpu\-freq\fR option +with linuxproc as the ProctrackType can cause jobs to run too quickly before +accounting is able to poll for job information. As a result, not all of +the accounting information will be present. +.RE + .TP \fB\-c\fR, \fB\-\-cpus\-per\-task\fR=<\fIncpus\fR> Advise the SLURM controller that ensuing job steps will require \fIncpus\fR diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 49ae1f894994eec8279e84c33ba8e23f37ac047f..f9ed1916e93bd8c209f6f839d328d78f7fedf392 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -383,14 +383,43 @@ Show help message for cpu_bind .RE .TP -\fB\-\-cpu\-freq\fR =<\fIrequested frequency in kilohertz\fR> +\fB\-\-cpu\-freq\fR =<\fIp1\fR[-\fIp2\fR[:\fIp3\fR]]> -Request that the job step initiated by this srun command be run at the +Request that the job step initiated by this srun command be run at some requested frequency if possible, on the CPUs selected for the step on the compute node(s). + +\fBp1\fR can be [#### | low | medium | high | highm1] which will set the +frequency scaling_speed to the corresponding value, and set the frequency +scaling_governor to UserSpace. See below for definition of the values. 
+ +\fBp1\fR can be [Conservative | OnDemand | Performance | PowerSave] which +will set the scaling_governor to the corresponding value. The governor has to be +in the list set by the slurm.conf option CpuFreqGovernors. + +When \fBp2\fR is present, p1 will be the minimum scaling frequency and +p2 will be the maximum scaling frequency. + +\fBp2\fR can be [#### | medium | high | highm1]. p2 must be greater than p1. + +\fBp3\fR can be [Conservative | OnDemand | Performance | PowerSave | UserSpace] +which will set the governor to the corresponding value. + +If \fBp3\fR is UserSpace, the frequency scaling_speed will be set by a power +or energy aware scheduling strategy to a value between p1 and p2 that lets the +job run within the site's power goal. The job may be delayed if p1 is higher +than a frequency that allows the job to run within the goal. + +If the current frequency is < min, it will be set to min. Likewise, +if the current frequency is > max, it will be set to max. + + Acceptable values at present include: .RS .TP 14 +\fB####\fR +frequency in kilohertz +.TP \fBLow\fR the lowest available frequency .TP @@ -414,6 +443,10 @@ attempts to use the Performance CPU governor .TP \fBPowerSave\fR attempts to use the PowerSave CPU governor +.TP +\fBUserSpace\fR +attempts to use the UserSpace CPU governor + .RE The following informational environment variable is set in the job @@ -425,7 +458,9 @@ step when \fB\-\-cpu\-freq\fR option is requested. This environment variable can also be used to supply the value for the CPU frequency request if it is set when the 'srun' command is issued. The \fB\-\-cpu\-freq\fR on the command line will override the -environment variable value. See the \fBENVIRONMENT VARIABLES\fR +environment variable value. The form of the environment variable is +the same as on the command line. +See the \fBENVIRONMENT VARIABLES\fR section for a description of the SLURM_CPU_FREQ_REQ variable. 
\fBNOTE\fR: This parameter is treated as a request, not a requirement. @@ -448,6 +483,8 @@ with linuxproc as the ProctrackType can cause jobs to run too quickly before Accounting is able to poll for job information. As a result not all of accounting information will be present. +.RE + .TP \fB\-c\fR, \fB\-\-cpus\-per\-task\fR=<\fIncpus\fR> Request that \fIncpus\fR be allocated \fBper process\fR. This may be diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index d5669c42f8496261e31ba8d853c569617d8ab84b..75fd6f61013d1ae74247a96d3c93b14ab629ef21 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -494,6 +494,29 @@ attempts to use the Performance CPU governor attempts to use the PowerSave CPU governor .RE +.TP +\fBCpuFreqGovernors\fR +List of CPU frequency governors allowed to be set with --cpu-freq +on salloc, sbatch, or srun. +Acceptable values at present include: +.RS +.TP +\fBConservative\fR +attempts to use the Conservative CPU governor +.TP +\fBOnDemand\fR +attempts to use the OnDemand CPU governor (the default value) +.TP +\fBPerformance\fR +attempts to use the Performance CPU governor +.TP +\fBPowerSave\fR +attempts to use the PowerSave CPU governor +.TP +\fBUserSpace\fR +attempts to use the UserSpace CPU governor +.RE +The default is OnDemand. .TP \fBCryptoType\fR The cryptographic signature tool to be used in the creation of @@ -540,6 +563,9 @@ Burst Buffer plugin \fBCPU_Bind\fR CPU binding details for jobs and steps .TP +\fBCpuFrequency\fR +Cpu Frequency details for jobs and steps with --cpu-freq=... +.TP \fBDB_ASSOC\fR SQL statements/queries when dealing with associations in the database. 
.TP diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 9c9e87262ae30354aca51209e9e627d7f98e3723..aecef94cfb2f6871f080b6803e04d537f4ffa7f2 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -59,6 +59,7 @@ #include <unistd.h> #include "src/common/assoc_mgr.h" +#include "src/common/cpu_frequency.h" #include "src/common/gres.h" #include "src/common/hostlist.h" #include "src/common/layouts_mgr.h" @@ -997,6 +998,7 @@ int read_slurm_conf(int recover, bool reconfig) rehash_node(); slurm_topo_build_config(); route_g_reconfigure(); + cpu_freq_reconfig(); rehash_jobs(); set_slurmd_addr(); diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index cb893eb91595cb7fa501c0df1cd6e662293d9fdb..7d716fc3447f6debb55fe127feb043f319e731de 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -995,6 +995,7 @@ _reconfigure(void) slurm_topo_build_config(); _set_topo_info(); route_g_reconfigure(); + cpu_freq_reconfig(); /* * In case the administrator changed the cpu frequency set capabilities diff --git a/testsuite/expect/README b/testsuite/expect/README index 9687c124d3f850f10a191617bdae2019ef2c72a3..412552bbed6a9800272c2d546bc880d430ab0391 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -177,6 +177,7 @@ test1.72 Validate JobAcctGatherFrequency configuration parameter is enforced. test1.73 Validate OverTimeLimit configuration parameter is enforced. test1.74 Validate MaxNode and GrpNode limit enforcment with QoS & association. test1.75 Test that --cpu-freq is enforced when using non-numeric values +test1.76 Test that --cpu-freq sets min-max:gov **NOTE** The following tests attempt to utilize multiple CPUs or partitions, The test will print "WARNING" and terminate with an exit code of