diff --git a/NEWS b/NEWS index b8f718093bf7b602abd93ec0135f30bb6c2e1ff2..7d5bd2da31dbb018855fedf9e3e9c299fdc61d6b 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,13 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 1.4.0-pre12 +============================== + -- Added support for hard time limits on associations, with the new + slurm.conf option PriorityUsageResetPeriod which is used to reset the + accumulated time used. (Currently only available with the + priority/multifactor plugin.) + * Changes in SLURM 1.4.0-pre11 ============================== -- Fix slurm.spec file for RPM build. diff --git a/doc/html/priority_multifactor.shtml b/doc/html/priority_multifactor.shtml index b0fbb0d90cf77c04fc22259d2afe0d6fd3a0d095..af9a5fcf993cb3ed38eab2d96bfb18655085fcbd 100644 --- a/doc/html/priority_multifactor.shtml +++ b/doc/html/priority_multifactor.shtml @@ -426,13 +426,32 @@ factor as it is currently configured.</P> <DT> PriorityDecayHalfLife <DD> This determines the contribution of historical usage on the composite usage value. The higher the number, the longer past usage - affects fair-share. The unit is a time string (i.e. min, min:sec, - hr:min:sec, days-hr:min:sec, or days-hr. The default value is 7-0 (7 days). + affects fair-share. If set to 0 no decay will be applied. This is helpful if + you want to enforce hard time limits per association. If set to 0, + PriorityUsageResetPeriod must be set to some interval. + The unit is a time string (e.g. min, hr:min:00, days-hr:min:00, or + days-hr). The default value is 7-0 (7 days). +<DT> PriorityUsageResetPeriod +<DD> At this interval the usage of associations will be reset to 0. + This is used if you want to enforce hard limits on time usage per + association. If PriorityDecayHalfLife is set to 0 no decay will + happen and this is the only way to reset the usage accumulated by + running jobs. By default this is turned off; using the + PriorityDecayHalfLife option instead is advised so that your cluster + does not end up with nothing eligible to run, but if your scheme only + allows associations a fixed amount of time on the system this is the + way to enforce it. + The unit is a time string (e.g. min, hr:min:00, days-hr:min:00, or + days-hr). The default value is not set (turned off). + <DT> PriorityFavorSmall -<DD> A boolean that sets the polarity of the job size factor. The default setting is NO which results in larger node sizes having a larger job size factor. Setting this parameter to YES means that the smaller the job, the greater the job size factor will be. +<DD> A boolean that sets the polarity of the job size factor. The + default setting is NO which results in larger node sizes having a + larger job size factor. Setting this parameter to YES means that + the smaller the job, the greater the job size factor will be. <DT> PriorityMaxAge -<DD> Specifies the queue wait time at which the age factor maxes out. The unit is a time string (i.e. min, min:sec, - hr:min:sec, days-hr:min:sec, or days-hr. The default value is 7-0 (7 days). +<DD> Specifies the queue wait time at which the age factor maxes out. + The unit is a time string (e.g. min, hr:min:00, days-hr:min:00, or + days-hr). The default value is 7-0 (7 days). <DT> PriorityWeightAge <DD> An unsigned integer that scales the contribution of the age factor.
<DT> PriorityWeightFairshare @@ -451,10 +470,15 @@ factor as it is currently configured.</P> <a name=configexample> <h2>Configuration Example</h2></a> -<P> The following are sample slurm.conf file settings for the Multi-factor Job Priority Plugin.</P> +<P> The following are sample slurm.conf file settings for the + Multi-factor Job Priority Plugin.</P> +<P> The first example is for running the plugin applying decay over + time to reduce usage. Hard limits can be used in this + configuration, but will have less effect since usage will decay + over time instead of accumulating without any decay.</P> <PRE> -# Activate the Multi-factor Job Priority Plugin +# Activate the Multi-factor Job Priority Plugin with decay PriorityType=priority/multifactor # 2 week half-life @@ -477,8 +501,37 @@ PriorityWeightPartition=1000 PriorityWeightQOS=0 # don't use the qos factor </PRE> +<P> This example is for running the plugin with no decay on usage, + thus making a reset of usage necessary.</P> +<PRE> +# Activate the Multi-factor Job Priority Plugin with no decay +PriorityType=priority/multifactor + +# apply no decay +PriorityDecayHalfLife=0 + +# reset usage after 28 days +PriorityUsageResetPeriod=28-0 + +# The larger the job, the greater its job size priority. +PriorityFavorSmall=NO + +# The job's age factor reaches 1.0 after waiting in the +# queue for 2 weeks. +PriorityMaxAge=14-0 + +# This next group determines the weighting of each of the +# components of the Multi-factor Job Priority Plugin. +# The default value for each of the following is 1. +PriorityWeightAge=1000 +PriorityWeightFairshare=10000 +PriorityWeightJobSize=1000 +PriorityWeightPartition=1000 +PriorityWeightQOS=0 # don't use the qos factor +</PRE> + <!--------------------------------------------------------------------------> -<p style="text-align:center;">Last modified 11 February 2009</p> +<p style="text-align:center;">Last modified 08 April 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/sacct.1 b/doc/man/man1/sacct.1 index cd82649dee810560997db252e069b6a01656c961..6ff80c535b7adcda5ddaaad5d9e9e41276c065c1 100644 --- a/doc/man/man1/sacct.1 +++ b/doc/man/man1/sacct.1 @@ -1,4 +1,4 @@ -.TH SACCT "1" "January 2009" "sacct 2.0" "Slurm components" +.TH SACCT "1" "April 2009" "sacct 2.0" "Slurm components" .SH "NAME" sacct \- displays accounting data for all jobs and job steps in the @@ -145,20 +145,20 @@ Print a list of fields that can be specified with the \f3\-\-format\fP option.
.ft 3 Fields available: -AllocCPUS Account AssocID AveCPU -AvePages AveRSS AveVSize BlockID -Cluster CPUTime CPUTimeRAW Elapsed -Eligible End ExitCode GID -Group JobID JobName NodeList -MaxPages MaxPagesNode MaxPagesTask MaxRSS -MaxRSSNode MaxRSSTask MaxVSize MaxVSizeNode -MaxVSizeTask MinCPU MinCPUNode MinCPUTask -NCPUS NNodes NTasks Priority -Partition QOS QOSRAW ReqCPUS -Reserved ResvCPU ResvCPURAW Start -State Submit Suspended SystemCPU -Timelimit TotalCPU UID User -UserCPU WCKey WCKeyID +AllocCPUS Account AssocID AveCPU +AvePages AveRSS AveVMSize BlockID +Cluster CPUTime CPUTimeRAW Elapsed +Eligible End ExitCode GID +Group JobID JobName Layout +MaxPages MaxPagesNode MaxPagesTask MaxRSS +MaxRSSNode MaxRSSTask MaxVMSize MaxVMSizeNode +MaxVMSizeTask MinCPU MinCPUNode MinCPUTask +NCPUS NNodes NodeList NTasks +Priority Partition QOS QOSRAW +ReqCPUS Reserved ResvCPU ResvCPURAW +Start State Submit Suspended +SystemCPU Timelimit TotalCPU UID +User UserCPU WCKey WCKeyID .ft 1 .fi @@ -343,15 +343,16 @@ The following describes each job accounting field: .RS .TP "10" \f3alloccpus\fP -Allocated processors. +Count of allocated processors. .TP \f3account\fP -User supplied account number for the job. +Account the job ran under. .TP \f3associd\fP Reference to the association of user, account and cluster. + .TP \f3avecpu\fP Average CPU time of a process. @@ -366,6 +367,7 @@ Average resident set size of a process. .TP \f3avevsize\fP +Average Virtual Memory size of a process. .TP \f3blockid\fP @@ -377,12 +379,12 @@ Cluster name. .TP \f3cputime\fP -Minimum CPU time of any process followed by its task id along with -the average of all processes running in the step. +Formatted number of cpu seconds a process was allocated. .TP \f3cputimeraw\fP - +How much cpu time the process was allocated, in seconds, not formatted +like above. .TP \f3elapsed\fP @@ -414,6 +416,7 @@ seconds .TP \f3eligible\fP +When the job became eligible to run. .TP \f3end\fP @@ -467,19 +470,22 @@ It is in the form: \f3jobname\fP The name of the job or job step. +.TP +\f3layout\fP +What the layout of a step was when it was running. This can be used +to give you an idea of which node ran which rank in your job. + .TP \f3maxpages\fP Maximum page faults of a process. .TP \f3maxpagesnode\fP -Maximum page faults of a node. - +The node where the maxpages occurred. .TP \f3maxpagestask\fP -Maximum page faults of a task. - +The task on maxpagesnode where the maxpages occurred. .TP \f3maxrss\fP @@ -487,23 +493,23 @@ Maximum resident set size of a process. .TP \f3maxrssnode\fP -Maximum resident set size of a node. +The node where the maxrss occurred. .TP \f3maxrsstask\fP -Maximum resident set size of a node. +The task on maxrssnode where the maxrss occurred. .TP -\f3maxvsize\fP +\f3maxvmsize\fP Maximum Virtual Memory size of any process. .TP -\f3maxvsizenode\fP -Maximum Virtual Memory size of a node. +\f3maxvmsizenode\fP +The node where the maxvmsize occurred. .TP -\f3maxvsizetask\fP -Maximum Virtual Memory size of a task. +\f3maxvmsizetask\fP +The task on maxvmsizenode where the maxvmsize occurred. .TP \f3mincpu\fP @@ -511,11 +517,11 @@ Minimum cpu of any process. .TP \f3mincpunode\fP -Minimum cpu of a node. +The node where the mincpu occurred. .TP -\f3mincputasks\fP -Minimum cpu of a task. +\f3mincputask\fP +The task on mincpunode where the mincpu occurred. .TP \f3ncpus\fP @@ -527,11 +533,11 @@ List of nodes in job/step. .TP \f3nnodes\fP -Number of nodes. +Number of nodes in a job or step. .TP \f3ntasks\fP -Total number of tasks in a job.
+Total number of tasks in a job or step. .TP \f3priority\fP @@ -543,10 +549,11 @@ Identifies the partition on which the job ran. .TP \f3qos\fP -Quality of service. +Name of Quality of Service. .TP \f3qosraw\fP +Id of Quality of Service. .TP \f3reqcpus\fP @@ -554,13 +561,17 @@ Required CPUs. .TP \f3reserved\fP +How much wall clock time was used as reserved time for this job. This is +derived from how long the job was waiting from its eligible time to when it +actually started. .TP \f3resvcpu\fP -Reserved CPUs. +Formatted time showing how long (in cpu seconds) the job was reserved. .TP \f3resvcpuraw\fP +Reserved CPU time in seconds, not formatted like above. .TP \f3start\fP @@ -570,7 +581,8 @@ Initiation time of the job in the same format as \f3end\fP. \f3state\fP Displays the job status, or state. -Output can be RUNNING, SUSPENDED, COMPLETED, CANCELLED, FAILED, TIMEOUT, or NODE_FAIL. +Output can be RUNNING, SUSPENDED, COMPLETED, CANCELLED, FAILED, +TIMEOUT, or NODE_FAIL. .TP \f3submit\fP @@ -579,6 +591,7 @@ was submitted. The format of the output is identical to that of the end field. .TP \f3suspended\fP +How long the job was suspended. .TP \f3systemcpu\fP @@ -591,9 +604,15 @@ field. .TP \f3timelimit\fP +The time limit of the job. .TP \f3totalcpu\fP +The total amount of CPU time actually used by the job, as opposed to the +time merely accounted for (which is most likely a higher number). (If the job +ran on multiple cpus this is a combination of all the times, so +this number could be much larger than the elapsed time.) The format of +the output is identical to that of the elapsed field. .TP \f3uid\fP @@ -605,7 +624,7 @@ The user name of the user who ran the job. .TP \f3usercpu\fP -The amount of user CPU time. (If job was running on mul-tiple cpus +The amount of user CPU time. (If job was running on multiple cpus this is a combination of all the times so this number could be much larger than the elapsed time.) The format of the output is identical to that of the elapsed field. diff --git a/doc/man/man1/sacctmgr.1 b/doc/man/man1/sacctmgr.1 index 9a348f13be59e486c38a155837f9a69e826da38a..446bed2c7f639eb39f3d7481de2ab5828683c769 100644 --- a/doc/man/man1/sacctmgr.1 +++ b/doc/man/man1/sacctmgr.1 @@ -1,4 +1,4 @@ -.TH SACCTMGR "1" "October 2008" "sacctmgr 2.0" "Slurm components" +.TH SACCTMGR "1" "April 2009" "sacctmgr 2.0" "Slurm components" .SH "NAME" sacctmgr \- Used to view and modify Slurm account information. @@ -215,13 +215,14 @@ Number used in conjunction with other accounts to determine job priority. To clear a previously set value use the modify command with a new value of \-1. .TP -\fIGrpCPUMins\fP=<max cpu hours> -Maximum number of CPU hours running jobs are able to be allocated in aggregate for -this association and all association which are children of this association. +\fIGrpCPUMins\fP=<max cpu minutes> +Maximum number of CPU minutes running jobs are able to be allocated in +aggregate for this association and all associations which are children +of this association. To clear a previously set value use the modify command with a new -value of \-1. (NOTE: This limit is not currently enforced in SLURM. -You can still set this, but have to wait for future versions of SLURM -before it is enforced.) +value of \-1. (NOTE: This limit is not enforced if set on the root +association of a cluster. So even though it may appear in sacctmgr +output it will not be enforced.) .TP \fIGrpCPUs\fP=<max cpus> @@ -256,6 +257,9 @@ To clear a previously set value use the modify command with a new value of \-1.
Maximum wall clock time running jobs are able to be allocated in aggregate for this association and all association which are children of this association. To clear a previously set value use the modify command with a new value of \-1. +(NOTE: This limit is not enforced if set on the root +association of a cluster. So even though it may appear in sacctmgr +output it will not be enforced.) .TP \fIMaxCPUMins\fP=<max cpu minutes> @@ -263,9 +267,7 @@ Maximum number of CPU minutes each job is able to use in this account. This is overridden if set directly on a user. Default is the cluster's limit. To clear a previously set value use the modify command with a new -value of \-1. (NOTE: This limit is not currently enforced in SLURM. -You can still set this, but have to wait for future versions of SLURM -before it is enforced.) +value of \-1. .TP \fIMaxCPUs\fP=<max cpus> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 1303e203fba5f0d1f17ac32fa6b4578a4e3a29d1..3bec13dd7e63abf3a9987f306ad1a4790a94b092 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "March 2009" "slurm.conf 2.0" "Slurm configuration file" +.TH "slurm.conf" "5" "April 2009" "slurm.conf 2.0" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file @@ -707,15 +707,19 @@ on SPANK plugins, see the \fBspank\fR(8) manual. \fBPriorityDecayHalfLife\fR This controls how long prior resource use is considered in determining how over\- or under\-serviced an association is (user, bank account and -cluster) in determining job priority. +cluster) in determining job priority. If set to 0 no decay will be applied. +This is helpful if you want to enforce hard time limits per association. If +set to 0, PriorityUsageResetPeriod must be set to some interval. Applicable only if PriorityType=priority/multifactor. -The units are minutes and the default value is 7 days. +The unit is a time string (e.g. min, hr:min:00, days-hr:min:00, +or days-hr). The default value is 7-0 (7 days). .TP \fBPriorityFavorSmall\fR Specifies that small jobs should be given preferencial scheduling priority. Applicable only if PriorityType=priority/multifactor. Supported values are "YES" and "NO". The default value is "NO". .TP @@ -724,7 +728,21 @@ Specifies the job age which will be given the maximum age factor in computing priority. For example, a value of 30 minutes would result in all jobs over 30 minutes old would get the same age\-based priority. Applicable only if PriorityType=priority/multifactor. -The units are minutes and the default value is 7 days. +The unit is a time string (e.g. min, hr:min:00, days-hr:min:00, +or days-hr). The default value is 7-0 (7 days). + +.TP +\fBPriorityUsageResetPeriod\fR +At this interval the usage of associations will be reset to 0. This is used +if you want to enforce hard limits on time usage per association. If +PriorityDecayHalfLife is set to 0 no decay will happen and this is the +only way to reset the usage accumulated by running jobs. By default this is +turned off; using the PriorityDecayHalfLife option instead is advised so that +your cluster does not end up with nothing eligible to run, but if your scheme +only allows associations a fixed amount of time on the system this is the way +to enforce it. +Applicable only if PriorityType=priority/multifactor. +The unit is a time string (e.g. min, hr:min:00, days-hr:min:00, +or days-hr). The default value is not set (turned off).
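To make the interaction of PriorityDecayHalfLife and PriorityUsageResetPeriod concrete, here is a minimal, self-contained C sketch. The 1 - (0.693 / half_life) per-second factor mirrors the approximation used by the multifactor plugin later in this diff; the constants, variable names, and the day-at-a-time loop are hypothetical and chosen only for illustration.

#include <math.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	double decay_hl = 7 * 24 * 3600;	/* PriorityDecayHalfLife: 7 days, in seconds */
	double usage = 1000000.0;		/* accumulated cpu-seconds for one association */
	double decay_factor = 1 - (0.693 / decay_hl);	/* ln(2) approximation, as in the plugin */
	time_t reset_period = 28 * 24 * 3600;	/* PriorityUsageResetPeriod: 28 days */
	time_t last_reset = 0, now = 0;
	int day;

	for (day = 1; day <= 7; day++) {
		/* decay is defined per elapsed second; fold one whole day at a time */
		usage *= pow(decay_factor, 24 * 3600);
		now += 24 * 3600;
		if (now - last_reset >= reset_period) {
			usage = 0;	/* hard reset of accumulated usage */
			last_reset = now;
		}
	}
	printf("usage after 7 days: %.0f (about half of the original)\n", usage);
	return 0;
}

Setting PriorityDecayHalfLife=0 corresponds to decay_factor = 1 in this sketch, which is why the reset period then becomes the only mechanism that ever shrinks accumulated usage.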
.TP \fBPriorityType\fR diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example index 12d9449633eaaaf41ac52b5910d8a5f8d0dc04e4..425f4ff952f2e0454538fab744f2b2250e5eb9f2 100644 --- a/etc/slurm.conf.example +++ b/etc/slurm.conf.example @@ -65,7 +65,8 @@ SchedulerType=sched/backfill SelectType=select/linear FastSchedule=1 #PriorityType=priority/multifactor -#PriorityDecayHalfLife=4:00:00 +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 #PriorityWeightFairshare=100000 #PriorityWeightAge=1000 #PriorityWeightPartition=10000 diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 454a6e9e53a790f070fafb52466629aa3e8ab9ba..554bba1c98e27aa360d0a2fd96b519f9f69c6208 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1118,6 +1118,8 @@ typedef struct slurm_ctl_conf { uint16_t priority_favor_small; /* favor small jobs over large */ uint32_t priority_max_age; /* time when not to add any more * priority to a job if reached */ + uint32_t priority_reset_period; /* time period to wait before + * resetting usage in seconds */ char *priority_type; /* priority type plugin */ uint32_t priority_weight_age; /* weight for age factor */ uint32_t priority_weight_fs; /* weight for Fairshare factor */ diff --git a/src/api/config_info.c b/src/api/config_info.c index ee54f2789d5000739b54da172e56b0a9b464293c..e917d2e6b904bde51e4a7c6f0f4fd9fe01ac1bff 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -274,6 +274,10 @@ void slurm_print_ctl_conf ( FILE* out, secs2time_str((time_t) slurm_ctl_conf_ptr->priority_max_age, tmp_str, sizeof(tmp_str)); fprintf(out, "PriorityMaxAge = %s\n", tmp_str); + secs2time_str((time_t) + slurm_ctl_conf_ptr->priority_reset_period, + tmp_str, sizeof(tmp_str)); + fprintf(out, "PriorityUsageResetPeriod= %s\n", tmp_str); fprintf(out, "PriorityType = %s\n", slurm_ctl_conf_ptr->priority_type); fprintf(out, "PriorityWeightAge = %u\n", diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 9f2645341157fd3fe96fefc49657116112bba62b..3dac9dad77d0a96d7d9b1c6cefc2ccb8f4d7f2ca 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -104,7 +104,6 @@ static int _addto_used_info(acct_association_rec_t *assoc1, if(!assoc1 || !assoc2) return SLURM_ERROR; - assoc1->grp_used_cpu_mins += assoc2->grp_used_cpu_mins; assoc1->grp_used_cpus += assoc2->grp_used_cpus; assoc1->grp_used_nodes += assoc2->grp_used_nodes; assoc1->grp_used_wall += assoc2->grp_used_wall; @@ -121,14 +120,13 @@ static int _clear_used_info(acct_association_rec_t *assoc) if(!assoc) return SLURM_ERROR; - assoc->grp_used_cpu_mins = 0; assoc->grp_used_cpus = 0; assoc->grp_used_nodes = 0; - assoc->grp_used_wall = 0; assoc->used_jobs = 0; assoc->used_submit_jobs = 0; - /* do not reset usage_raw if you need to reset it do it + /* do not reset usage_raw or grp_used_wall. 
+ * if you need to reset it do it * else where since sometimes we call this and do not want * shares reset */ @@ -1746,6 +1744,7 @@ extern int assoc_mgr_update_assocs(acct_update_object_t *update) if(!object->user) { _clear_used_info(object); object->usage_raw = 0; + object->grp_used_wall = 0; } _set_assoc_parent_and_user( object, assoc_mgr_association_list, reset); @@ -2251,7 +2250,6 @@ extern int dump_assoc_mgr_state(char *state_save_location) if(assoc_mgr_association_list) { ListIterator itr = NULL; acct_association_rec_t *assoc = NULL; - slurm_mutex_lock(&assoc_mgr_association_lock); itr = list_iterator_create(assoc_mgr_association_list); while((assoc = list_next(itr))) { @@ -2263,6 +2261,7 @@ extern int dump_assoc_mgr_state(char *state_save_location) anything under 1 we are dropping */ pack64((uint64_t)assoc->usage_raw, buffer); + pack32(assoc->grp_used_wall, buffer); } list_iterator_destroy(itr); slurm_mutex_unlock(&assoc_mgr_association_lock); @@ -2381,22 +2380,24 @@ extern int load_assoc_usage(char *state_save_location) itr = list_iterator_create(assoc_mgr_association_list); while (remaining_buf(buffer) > 0) { uint32_t assoc_id = 0; - uint64_t uint64_tmp = 0; + uint32_t grp_used_wall = 0; + uint64_t usage_raw = 0; acct_association_rec_t *assoc = NULL; safe_unpack32(&assoc_id, buffer); - safe_unpack64(&uint64_tmp, buffer); - while((assoc = list_next(itr))) { - if(!assoc->user) - continue; + safe_unpack64(&usage_raw, buffer); + safe_unpack32(&grp_used_wall, buffer); + while((assoc = list_next(itr))) if(assoc->id == assoc_id) break; - } - if(assoc) { - while(assoc) { - assoc->usage_raw += (long double)uint64_tmp; - assoc = assoc->parent_assoc_ptr; - } + + while(assoc) { + assoc->grp_used_wall += grp_used_wall; + assoc->usage_raw += (long double)usage_raw; + + assoc = assoc->parent_assoc_ptr; + if(assoc == assoc_mgr_root_assoc) + break; } list_iterator_reset(itr); } @@ -2409,6 +2410,10 @@ extern int load_assoc_usage(char *state_save_location) unpack_error: if(buffer) free_buf(buffer); + if(itr) { + list_iterator_destroy(itr); + slurm_mutex_unlock(&assoc_mgr_association_lock); + } return SLURM_ERROR; } diff --git a/src/common/print_fields.c b/src/common/print_fields.c index b460cd47694a116ace508c8d4ef6548bf8b5634d..0681f70dad9a7f90424a053e04800ca83fb363a3 100644 --- a/src/common/print_fields.c +++ b/src/common/print_fields.c @@ -86,11 +86,7 @@ extern void print_fields_header(List print_fields_list) printf("%s|", field->name); else { int abs_len = abs(field->len); - if(field->len == abs_len) - printf("%*.*s ", abs_len, abs_len, field->name); - else - printf("%-*.*s ", abs_len, abs_len, - field->name); + printf("%*.*s ", abs_len, abs_len, field->name); } curr_inx++; } diff --git a/src/common/read_config.c b/src/common/read_config.c index 8c40bc56064321d169e0340dbb68ce6502f72873..3e064982e5d1669f4444f00b0fe412c6d70e9466 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -202,6 +202,7 @@ s_p_options_t slurm_conf_options[] = { {"PriorityDecayHalfLife", S_P_STRING}, {"PriorityFavorSmall", S_P_BOOLEAN}, {"PriorityMaxAge", S_P_STRING}, + {"PriorityUsageResetPeriod", S_P_STRING}, {"PriorityType", S_P_STRING}, {"PriorityWeightAge", S_P_UINT32}, {"PriorityWeightFairshare", S_P_UINT32}, @@ -1978,6 +1979,25 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) } else conf->priority_max_age = DEFAULT_PRIORITY_DECAY; + if (s_p_get_string(&temp_str, "PriorityUsageResetPeriod", hashtbl)) { + int max_time = time_str2mins(temp_str); + if ((max_time < 0) && 
(max_time != INFINITE)) { + fatal("Bad value \"%s\" for PriorityUsageResetPeriod", + temp_str); + } + conf->priority_reset_period = max_time * 60; + xfree(temp_str); + } else { + conf->priority_reset_period = NO_VAL; + if(!conf->priority_decay_hl) { + fatal("You have to either have " + "PriorityDecayHalfLife != 0 or " + "PriorityUsageResetPeriod set to something " + "or the priority plugin will result in " + "rolling over."); + } + } + if (!s_p_get_string(&conf->priority_type, "PriorityType", hashtbl)) conf->priority_type = xstrdup(DEFAULT_PRIORITY_TYPE); diff --git a/src/common/slurm_accounting_storage.h b/src/common/slurm_accounting_storage.h index 0b292d10e43e3ab1a41946368b6c254224458867..9a412191a48a32f9b3fcce8e30067a8191195592 100644 --- a/src/common/slurm_accounting_storage.h +++ b/src/common/slurm_accounting_storage.h @@ -146,7 +146,7 @@ typedef struct acct_association_rec { char *cluster; /* cluster associated to association * */ - uint64_t grp_cpu_mins; /* max number of cpu hours the + uint64_t grp_cpu_mins; /* max number of cpu minutes the * underlying group of * associations can run for */ uint32_t grp_cpus; /* max number of cpus the @@ -166,10 +166,6 @@ typedef struct acct_association_rec { * underlying group of * associations can run for */ - uint32_t grp_used_cpu_mins; /* cpu mins the - * underlying group of - * associations has ran for - * (DON'T PACK) */ uint32_t grp_used_cpus; /* count of active jobs in the group * (DON'T PACK) */ uint32_t grp_used_nodes; /* count of active jobs in the group @@ -304,8 +300,6 @@ typedef struct { * one time */ uint32_t grp_wall; /* total time in hours this qos can run for */ - uint32_t grp_used_cpu_mins; /* cpu hours this qos has ran for - * (DON'T PACK) */ uint32_t grp_used_cpus; /* count of cpus in use in this qos * (DON'T PACK) */ uint32_t grp_used_jobs; /* count of active jobs (DON'T PACK) */ @@ -338,6 +332,8 @@ typedef struct { uint32_t priority; /* ranged int needs to be a unint for * heterogeneous systems */ double usage_factor; /* factor to apply to usage in this qos */ + long double usage_raw; /* measure of resource usage (DON'T PACK) */ + List user_limit_list; /* acct_used_limits_t's */ } acct_qos_rec_t; diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 324086f85b088cfa8ab7fea3f920c016f460728e..5374d0a1f695a28175f217878d1a87eefeb9bb4c 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -464,6 +464,25 @@ uint32_t slurm_get_priority_max_age(void) return age; } +/* slurm_get_priority_reset_period + * returns the priority usage reset period in seconds from slurmctld_conf object + * RET uint32_t - decay_hl in secs. 
+ */ +uint32_t slurm_get_priority_reset_period(void) +{ + uint32_t reset_period = NO_VAL; + slurm_ctl_conf_t *conf; + + if(slurmdbd_conf) { + } else { + conf = slurm_conf_lock(); + reset_period = conf->priority_reset_period; + slurm_conf_unlock(); + } + + return reset_period; +} + /* slurm_get_priority_type * returns the priority type from slurmctld_conf object * RET char * - priority type, MUST be xfreed by caller diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index f8a0581d7c417a5560cb89c6a58b219a1cb9c7c5..38448430d5c54e3b23ef9f81bc81b63ebd101672 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -202,6 +202,12 @@ bool slurm_get_priority_favor_small(void); */ uint32_t slurm_get_priority_max_age(void); +/* slurm_get_priority_reset_period + * returns the priority usage reset period in seconds from slurmctld_conf object + * RET uint32_t - decay_hl in secs. + */ +uint32_t slurm_get_priority_reset_period(void); + /* slurm_get_priority_type * returns the priority type from slurmctld_conf object * RET char * - priority type, MUST be xfreed by caller diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 45ae88a54e1d49a400e6ac8f19529abe85ac0120..9016ec5ee8f029b9460289ba48378cb5c2f06756 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2852,6 +2852,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) pack32(build_ptr->priority_decay_hl, buffer); pack16(build_ptr->priority_favor_small, buffer); pack32(build_ptr->priority_max_age, buffer); + pack32(build_ptr->priority_reset_period, buffer); packstr(build_ptr->priority_type, buffer); pack32(build_ptr->priority_weight_age, buffer); pack32(build_ptr->priority_weight_fs, buffer); @@ -3040,6 +3041,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpack32(&build_ptr->priority_decay_hl, buffer); safe_unpack16(&build_ptr->priority_favor_small, buffer); safe_unpack32(&build_ptr->priority_max_age, buffer); + safe_unpack32(&build_ptr->priority_reset_period, buffer); safe_unpackstr_xmalloc(&build_ptr->priority_type, &uint32_tmp, buffer); safe_unpack32(&build_ptr->priority_weight_age, buffer); diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 0033f70ed8a581b25ace7a9dcb2a52316f3326d0..944f91b2fb69da2754f1a62965729ae86214be09 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -125,15 +125,18 @@ static int _apply_decay(double decay_factor) { ListIterator itr = NULL; acct_association_rec_t *assoc = NULL; + acct_qos_rec_t *qos = NULL; - if(!calc_fairshare) + /* continue if decay_factor is 0 or 1 since that doesn't help + us at all. 
1 means no decay and 0 will just zero + everything out so don't waste time doing it */ + if(!decay_factor) + return SLURM_ERROR; + else if(!calc_fairshare) return SLURM_SUCCESS; xassert(assoc_mgr_association_list); - if(!decay_factor) - return SLURM_ERROR; - slurm_mutex_lock(&assoc_mgr_association_lock); itr = list_iterator_create(assoc_mgr_association_list); while((assoc = list_next(itr))) { @@ -144,17 +147,69 @@ static int _apply_decay(double decay_factor) list_iterator_destroy(itr); slurm_mutex_unlock(&assoc_mgr_association_lock); + slurm_mutex_lock(&assoc_mgr_qos_lock); + itr = list_iterator_create(assoc_mgr_qos_list); + while((qos = list_next(itr))) { + qos->usage_raw *= decay_factor; + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&assoc_mgr_qos_lock); + return SLURM_SUCCESS; } -static time_t _read_last_decay_ran() +/* + * reset usage_raw, and grp_used_cpu_mins on all associations + * This should be called every PriorityUsageResetPeriod + * RET: SLURM_SUCCESS on SUCCESS, SLURM_ERROR else. + */ +static int _reset_usage() +{ + ListIterator itr = NULL; + acct_association_rec_t *assoc = NULL; + acct_qos_rec_t *qos = NULL; + + if(!calc_fairshare) + return SLURM_SUCCESS; + + xassert(assoc_mgr_association_list); + + slurm_mutex_lock(&assoc_mgr_association_lock); + itr = list_iterator_create(assoc_mgr_association_list); + while((assoc = list_next(itr))) { + if (assoc == assoc_mgr_root_assoc) + continue; + assoc->usage_raw = 0; + assoc->grp_used_wall = 0; + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&assoc_mgr_association_lock); + + slurm_mutex_lock(&assoc_mgr_qos_lock); + itr = list_iterator_create(assoc_mgr_qos_list); + while((qos = list_next(itr))) { + qos->usage_raw = 0; + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&assoc_mgr_qos_lock); + + return SLURM_SUCCESS; +} + +static void _read_last_decay_ran(time_t *last_ran, time_t *last_reset) { int data_allocated, data_read = 0; uint32_t data_size = 0; int state_fd; char *data = NULL, *state_file; Buf buffer; - time_t last_ran = 0; + + xassert(last_ran); + xassert(last_reset); + + (*last_ran) = 0; + (*last_reset) = 0; + /* read the file */ state_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(state_file, "/priority_last_decay_ran"); @@ -163,7 +218,7 @@ static time_t _read_last_decay_ran() if (state_fd < 0) { info("No last decay (%s) to recover", state_file); unlock_state_files(); - return 0; + return; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -190,20 +245,21 @@ static time_t _read_last_decay_ran() unlock_state_files(); buffer = create_buf(data, data_size); - safe_unpack_time(&last_ran, buffer); + safe_unpack_time(last_ran, buffer); + safe_unpack_time(last_reset, buffer); free_buf(buffer); debug5("Last ran decay on jobs at %d", last_ran); - return last_ran; + return; unpack_error: - error("Incomplete priority last decay file returning no last ran"); + error("Incomplete priority last decay file returning"); free_buf(buffer); - return 0; + return; } -static int _write_last_decay_ran(time_t last_ran) +static int _write_last_decay_ran(time_t last_ran, time_t last_reset) { /* Save high-water mark to avoid buffer growth with copies */ static int high_buffer_size = BUF_SIZE; @@ -213,6 +269,7 @@ static int _write_last_decay_ran(time_t last_ran) Buf buffer = init_buf(high_buffer_size); pack_time(last_ran, buffer); + pack_time(last_reset, buffer); /* read the file */ old_file = xstrdup(slurmctld_conf.state_save_location); @@ -469,8 +526,16 @@ static void *_decay_thread(void *no_data) /* 
int sigarray[] = {SIGUSR1, 0}; */ struct tm tm; time_t last_ran = 0; - double decay_factor = - 1 - (0.693 / (double)slurm_get_priority_decay_hl()); + time_t last_reset = 0; + double decay_hl = (double)slurm_get_priority_decay_hl(); + double decay_factor = 1; + uint32_t reset_period = slurm_get_priority_reset_period(); + /* if decay_hl is 0 or less that means no decay is to be had. + This also means we flush the used time at a certain time + set by PriorityUsageResetPeriod in the slurm.conf + */ + if(decay_hl > 0) + decay_factor = 1 - (0.693 / decay_hl); /* Write lock on jobs, read lock on nodes and partitions */ slurmctld_lock_t job_write_lock = @@ -486,7 +551,7 @@ static void *_decay_thread(void *no_data) return NULL; } - last_ran = _read_last_decay_ran(); + _read_last_decay_ran(&last_ran, &last_reset); while(1) { int run_delta = 0; @@ -498,12 +563,33 @@ static void *_decay_thread(void *no_data) /* If reconfig is called handle all that happens outside of the loop here */ if(reconfig) { - decay_factor = - 1 - (0.693 - / (double)slurm_get_priority_decay_hl()); + /* if decay_hl is 0 or less that means no + decay is to be had. This also means we + flush the used time at a certain time + set by PriorityUsageResetPeriod in the slurm.conf + */ + reset_period = slurm_get_priority_reset_period(); + decay_hl = (double)slurm_get_priority_decay_hl(); + if(decay_hl > 0) + decay_factor = 1 - (0.693 / decay_hl); + else + decay_factor = 1; + reconfig = 0; } + /* this needs to be done right away so as to + incorporate it into the decay loop. + */ + if(reset_period != (uint32_t)NO_VAL) { + if(!last_reset) + last_reset = start_time; + else if(start_time >= last_reset+reset_period) { + _reset_usage(); + last_reset = start_time; + } + } + if(!last_ran) goto get_usage; else @@ -538,6 +624,7 @@ static void *_decay_thread(void *no_data) job_ptr->assoc_ptr; time_t start_period = last_ran; time_t end_period = start_time; + uint64_t cpu_time = 0; if(job_ptr->start_time > start_period) start_period = job_ptr->start_time; @@ -556,28 +643,44 @@ static void *_decay_thread(void *no_data) debug4("job %u ran for %d seconds", job_ptr->job_id, run_delta); + /* get run time in seconds */ + cpu_time = (uint64_t)run_delta + * (uint16_t)job_ptr->total_procs; /* figure out the decayed new usage to add */ - real_decay = ((double)run_delta - * (double)job_ptr->total_procs) + real_decay = (double)cpu_time * pow(decay_factor, (double)run_delta); /* now apply the usage factor for this qos */ - if(qos && qos->usage_factor > 0) - real_decay *= qos->usage_factor; + if(qos) { + slurm_mutex_lock(&assoc_mgr_qos_lock); + if(qos->usage_factor > 0) + real_decay *= qos->usage_factor; + qos->usage_raw += + (long double)real_decay; + slurm_mutex_unlock(&assoc_mgr_qos_lock); + } slurm_mutex_lock(&assoc_mgr_association_lock); while(assoc) { + assoc->grp_used_wall += run_delta; assoc->usage_raw += (long double)real_decay; debug4("adding %f new usage to " "assoc %u (user='%s' acct='%s') " - "raw usage is now %Lf", + "raw usage is now %Lf. 
Group " + "wall added %d making it %d.", real_decay, assoc->id, assoc->user, assoc->acct, - assoc->usage_raw); + assoc->usage_raw, run_delta, + assoc->grp_used_wall); + assoc = assoc->parent_assoc_ptr; + /* we don't want to make the + root assoc responsible for + keeping track of time + */ if (assoc == assoc_mgr_root_assoc) break; } @@ -610,7 +713,7 @@ static void *_decay_thread(void *no_data) last_ran = start_time; - _write_last_decay_ran(last_ran); + _write_last_decay_ran(last_ran, last_reset); running_decay = 0; slurm_mutex_unlock(&decay_lock); diff --git a/src/sacct/sacct.c b/src/sacct/sacct.c index 6676c7946e246c75be4eb197521d14c723d93627..6445cbd7cad169e5c39f96a0c40efa22418001b0 100644 --- a/src/sacct/sacct.c +++ b/src/sacct/sacct.c @@ -77,8 +77,8 @@ print_field_t fields[] = { {10, "MinCPUNode", print_fields_str, PRINT_MINCPUNODE}, {10, "MinCPUTask", print_fields_int, PRINT_MINCPUTASK}, {10, "NCPUS", print_fields_int, PRINT_ALLOC_CPUS}, - {8, "NNodes", print_fields_str, PRINT_NNODES}, {15, "NodeList", print_fields_str, PRINT_NODELIST}, + {8, "NNodes", print_fields_str, PRINT_NNODES}, {8, "NTasks", print_fields_int, PRINT_NTASKS}, {10, "Priority", print_fields_int, PRINT_PRIO}, {10, "Partition", print_fields_str, PRINT_PARTITION}, diff --git a/src/sacctmgr/association_functions.c b/src/sacctmgr/association_functions.c index 031ec2502182b2a7f50566118700f7ef6fde3901..14c9e34c39e9e90bcabeb3966856295720208ab4 100644 --- a/src/sacctmgr/association_functions.c +++ b/src/sacctmgr/association_functions.c @@ -358,7 +358,8 @@ extern int sacctmgr_list_association(int argc, char *argv[]) return SLURM_ERROR; } else if(!list_count(format_list)) slurm_addto_char_list(format_list, - "C,A,U,Part,F,GrpJ,GrpN,GrpS," + "C,A,U,Part,F," + "GrpCPUMins,GrpJ,GrpN,GrpS,GrpWall," "MaxJ,MaxN,MaxS,MaxW,QOS"); print_fields_list = list_create(destroy_print_field); @@ -383,7 +384,7 @@ extern int sacctmgr_list_association(int argc, char *argv[]) field->type = PRINT_ACCOUNT; field->name = xstrdup("Account"); if(tree_display) - field->len = 20; + field->len = -20; else field->len = 10; field->print_routine = print_fields_str; diff --git a/src/sacctmgr/cluster_functions.c b/src/sacctmgr/cluster_functions.c index 636bfc1450626b1d7b1f93c1556519576c116137..62df3d4922cd88e90c0f4b992bb9a7b194c1a78b 100644 --- a/src/sacctmgr/cluster_functions.c +++ b/src/sacctmgr/cluster_functions.c @@ -139,10 +139,10 @@ static int _set_rec(int *start, int argc, char *argv[], set = 1; } else if (!strncasecmp (argv[i], "GrpCPUMins", MAX(command_len, 7))) { - if (get_uint64(argv[i]+end, - &assoc->grp_cpu_mins, - "GrpCPUMins") == SLURM_SUCCESS) - set = 1; + exit_code=1; + fprintf(stderr, "GrpCPUMins is not a valid option " + "for the root association of a cluster.\n"); + break; } else if (!strncasecmp (argv[i], "GrpCpus", MAX(command_len, 7))) { if (get_uint(argv[i]+end, &assoc->grp_cpus, @@ -165,16 +165,9 @@ static int _set_rec(int *start, int argc, char *argv[], set = 1; } else if (!strncasecmp (argv[i], "GrpWall", MAX(command_len, 4))) { - mins = time_str2mins(argv[i]+end); - if (mins != NO_VAL) { - assoc->grp_wall = (uint32_t) mins; - set = 1; - } else { - exit_code=1; - fprintf(stderr, - " Bad GrpWall time format: %s\n", - argv[i]); - } + exit_code=1; + fprintf(stderr, "GrpWall is not a valid option " + "for the root association of a cluster.\n"); } else if (!strncasecmp (argv[i], "MaxCPUMinsPerJob", MAX(command_len, 7))) { if (get_uint64(argv[i]+end, @@ -329,13 +322,11 @@ extern int sacctmgr_add_cluster(int argc, char *argv[]) 
cluster->root_assoc->shares_raw = start_assoc.shares_raw; - cluster->root_assoc->grp_cpu_mins = start_assoc.grp_cpu_mins; cluster->root_assoc->grp_cpus = start_assoc.grp_cpus; cluster->root_assoc->grp_jobs = start_assoc.grp_jobs; cluster->root_assoc->grp_nodes = start_assoc.grp_nodes; cluster->root_assoc->grp_submit_jobs = start_assoc.grp_submit_jobs; - cluster->root_assoc->grp_wall = start_assoc.grp_wall; cluster->root_assoc->max_cpu_mins_pj = start_assoc.max_cpu_mins_pj; @@ -490,12 +481,6 @@ extern int sacctmgr_list_cluster(int argc, char *argv[]) field->name = xstrdup("FairShare"); field->len = 9; field->print_routine = print_fields_uint; - } else if(!strncasecmp("GrpCPUMins", object, - MAX(command_len, 8))) { - field->type = PRINT_GRPCM; - field->name = xstrdup("GrpCPUMins"); - field->len = 11; - field->print_routine = print_fields_uint64; } else if(!strncasecmp("GrpCPUs", object, MAX(command_len, 8))) { field->type = PRINT_GRPC; @@ -520,12 +505,6 @@ extern int sacctmgr_list_cluster(int argc, char *argv[]) field->name = xstrdup("GrpSubmit"); field->len = 9; field->print_routine = print_fields_uint; - } else if(!strncasecmp("GrpWall", object, - MAX(command_len, 4))) { - field->type = PRINT_GRPW; - field->name = xstrdup("GrpWall"); - field->len = 11; - field->print_routine = print_fields_time; } else if(!strncasecmp("MaxCPUMinsPerJob", object, MAX(command_len, 7))) { field->type = PRINT_MAXCM; @@ -672,12 +651,6 @@ extern int sacctmgr_list_cluster(int argc, char *argv[]) assoc->shares_raw, (curr_inx == field_count)); break; - case PRINT_GRPCM: - field->print_routine( - field, - assoc->grp_cpu_mins, - (curr_inx == field_count)); - break; case PRINT_GRPC: field->print_routine(field, assoc->grp_cpus, @@ -698,12 +671,6 @@ extern int sacctmgr_list_cluster(int argc, char *argv[]) assoc->grp_submit_jobs, (curr_inx == field_count)); break; - case PRINT_GRPW: - field->print_routine( - field, - assoc->grp_wall, - (curr_inx == field_count)); - break; case PRINT_MAXCM: field->print_routine( field, diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index c0eafdaee84156f063d32267aa05f96acf4d4a16..9719fa9e095f2a6a61c144c577e35eff90cbdb4f 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -811,13 +811,12 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ and WOPLimits \n\ \n\ list cluster - Format=, Names= \n\ - add cluster - Fairshare=, GrpCPUMins=, GrpCPUs=, GrpJobs=, \n\ - GrpNodes=, GrpSubmitJob=, GrpWall=, MaxCPUMins=\n\ + add cluster - Fairshare=, GrpCPUs=, GrpJobs=, \n\ + GrpNodes=, GrpSubmitJob=, MaxCPUMins= \n\ MaxJobs=, MaxNodes=, MaxWall=, and Name= \n\ - modify cluster - (set options) Fairshare=, GrpCPUMins=, \n\ + modify cluster - (set options) Fairshare=, \n\ GrpCPUs=, GrpJobs=, GrpNodes=, GrpSubmitJob=, \n\ - GrpWall=, MaxCPUMins=, MaxJobs=, MaxNodes=, \n\ - and MaxWall= \n\ + MaxCPUMins=, MaxJobs=, MaxNodes=, and MaxWall= \n\ (where options) Names= \n\ delete cluster - Names= \n\ \n\ @@ -875,8 +874,8 @@ sacctmgr [<OPTION>] [<COMMAND>] \n\ User \n\ \n\ Cluster - Cluster, ControlHost, ControlPort, CpuCount, \n\ - Fairshare, GrpCPUMins, GrpCPUs, GrpJobs, \n\ - GrpNodes, GrpSubmitJob, GrpWall, MaxCPUs, \n\ + Fairshare, GrpCPUs, GrpJobs, \n\ + GrpNodes, GrpSubmitJob, MaxCPUs, \n\ MaxCPUMins, MaxJobs, MaxNodes, MaxSubmitJobs, \n\ MaxWall, NodeCount, NodeNames \n\ \n\ diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 21c895b245b7717c55d19eb300c15efd05482a35..8138a14c9c704b09d947b3df6f69e89c9f55af1d 100644 --- a/src/slurmctld/acct_policy.c +++ 
b/src/slurmctld/acct_policy.c @@ -245,13 +245,29 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) slurm_mutex_lock(&assoc_mgr_association_lock); assoc_ptr = job_ptr->assoc_ptr; while(assoc_ptr) { + uint64_t usage_mins = + (uint64_t)(assoc_ptr->usage_raw / 60.0); + uint32_t wall_mins = assoc_ptr->grp_used_wall / 60; #if _DEBUG info("acct_job_limits: %u of %u", assoc_ptr->used_jobs, assoc_ptr->max_jobs); #endif - /* NOTE: We can't enforce assoc_ptr->grp_cpu_mins at this - * time because we aren't keeping track of how long - * jobs have been running yet */ + if ((assoc_ptr->grp_cpu_mins != (uint64_t)NO_VAL) + && (assoc_ptr->grp_cpu_mins != (uint64_t)INFINITE) + && (usage_mins >= assoc_ptr->grp_cpu_mins)) { + job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; + xfree(job_ptr->state_desc); + debug2("job %u being held, " + "assoc %u is at or exceeds " + "group max cpu minutes limit %llu " + "with %Lf for account %s", + job_ptr->job_id, assoc_ptr->id, + assoc_ptr->grp_cpu_mins, + assoc_ptr->usage_raw, assoc_ptr->acct); + + rc = false; + goto end_it; + } if ((assoc_ptr->grp_jobs != NO_VAL) && (assoc_ptr->grp_jobs != INFINITE) && @@ -302,27 +318,22 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr) } /* we don't need to check submit_jobs here */ - - /* FIX ME: Once we start tracking time of running jobs - * we will need to update the amount of time we have - * used and check against that here. When we start - * keeping track of time we will also need to come up - * with a way to refresh the time. - */ - if ((assoc_ptr->grp_wall != NO_VAL) && - (assoc_ptr->grp_wall != INFINITE)) { - time_limit = assoc_ptr->grp_wall; - if ((job_ptr->time_limit != NO_VAL) && - (job_ptr->time_limit > time_limit)) { - info("job %u being cancelled, " - "time limit %u exceeds group " - "time limit %u for account %s", - job_ptr->job_id, job_ptr->time_limit, - time_limit, assoc_ptr->acct); - _cancel_job(job_ptr); - rc = false; - goto end_it; - } + + if ((assoc_ptr->grp_wall != NO_VAL) + && (assoc_ptr->grp_wall != INFINITE) + && (wall_mins >= assoc_ptr->grp_wall)) { + job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT; + xfree(job_ptr->state_desc); + debug2("job %u being held, " + "assoc %u is at or exceeds " + "group wall limit %u " + "with %u for account %s", + job_ptr->job_id, assoc_ptr->id, + assoc_ptr->grp_wall, + wall_mins, assoc_ptr->acct); + + rc = false; + goto end_it; } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 7c89e61d88e6da6e8deb9d8e45ef3fc3d086faf2..9b823a89eb0a2d0d3579757d2ac26370da41ffae 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3364,7 +3364,7 @@ void job_time_limit(void) time_t old = now - slurmctld_conf.inactive_limit; time_t over_run; int resv_status = 0; - + uint64_t job_cpu_usage_mins = 0; if (slurmctld_conf.over_time_limit == (uint16_t) INFINITE) over_run = now - (365 * 24 * 60 * 60); /* one year */ else @@ -3372,14 +3372,26 @@ void job_time_limit(void) begin_job_resv_check(); job_iterator = list_iterator_create(job_list); - while ((job_ptr = - (struct job_record *) list_next(job_iterator))) { + while ((job_ptr =(struct job_record *) list_next(job_iterator))) { +/* acct_qos_rec_t *qos = NULL; */ + acct_association_rec_t *assoc = NULL; + xassert (job_ptr->magic == JOB_MAGIC); resv_status = job_resv_check(job_ptr); if (job_ptr->job_state != JOB_RUNNING) continue; +/* qos = (acct_qos_rec_t *)job_ptr->qos_ptr; */ + assoc = (acct_association_rec_t *)job_ptr->assoc_ptr; + + /* find out how many cpu minutes this job has been + 
running for. */ + job_cpu_usage_mins = (uint64_t) + ((((now - job_ptr->start_time) + - job_ptr->tot_sus_time) / 60) + * job_ptr->total_procs); + /* Consider a job active if it has any active steps */ if (job_ptr->step_list && (list_count(job_ptr->step_list) > 0)) @@ -3418,6 +3430,110 @@ void job_time_limit(void) continue; } + /* To be added later once qos actually works. The + * idea here is for qos to trump what an association + * has set for a limit, so if an association sets a + * wall limit of 10 mins and the qos has 20 mins set, and + * the job has been running for 11 minutes, it continues + * until 20. + */ +/* if(qos) { */ +/* slurm_mutex_lock(&assoc_mgr_qos_lock); */ +/* if ((qos->grp_cpu_mins != (uint64_t)NO_VAL) */ +/* && (qos->grp_cpu_mins != (uint64_t)INFINITE) */ +/* && ((uint64_t)qos->usage_raw */ +/* >= qos->grp_cpu_mins)) { */ +/* last_job_update = now; */ +/* info("QOS %s group max cpu minutes is " */ +/* "at or exceeds %llu with %Lf for JobId=%u", */ +/* qos->name, qos->grp_cpu_mins, */ +/* qos->usage_raw, job_ptr->job_id); */ +/* _job_timed_out(job_ptr); */ +/* job_ptr->state_reason = FAIL_TIMEOUT; */ +/* } */ + +/* if ((qos->max_wall_pj != NO_VAL) */ +/* && (qos->max_wall_pj != INFINITE) */ +/* && (job_ptr-> >= qos->max_wall_pj)) { */ +/* last_job_update = now; */ +/* info("QOS %s group max cpu minutes is " */ +/* "at or exceeds %llu with %Lf for JobId=%u", */ +/* qos->name, qos->grp_cpu_mins, */ +/* qos->usage_raw, job_ptr->job_id); */ +/* _job_timed_out(job_ptr); */ +/* job_ptr->state_reason = FAIL_TIMEOUT; */ +/* } */ +/* slurm_mutex_unlock(&assoc_mgr_qos_lock); */ + +/* if(job_ptr->state_reason == FAIL_TIMEOUT) { */ +/* xfree(job_ptr->state_desc); */ +/* continue; */ +/* } */ +/* } */ + + /* handle any association stuff here */ + slurm_mutex_lock(&assoc_mgr_association_lock); + while(assoc) { + uint64_t usage_mins = + (uint64_t)(assoc->usage_raw / 60.0); + uint32_t wall_mins = assoc->grp_used_wall / 60; + + if ((assoc->grp_cpu_mins != (uint64_t)NO_VAL) + && (assoc->grp_cpu_mins != (uint64_t)INFINITE) + && (usage_mins >= assoc->grp_cpu_mins)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "group max cpu minutes limit %llu " + "with %llu for account %s", + job_ptr->job_id, assoc->id, + assoc->grp_cpu_mins, + usage_mins, assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + if ((assoc->grp_wall != NO_VAL) + && (assoc->grp_wall != INFINITE) + && (wall_mins >= assoc->grp_wall)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "group wall limit %u " + "with %u for account %s", + job_ptr->job_id, assoc->id, + assoc->grp_wall, + wall_mins, assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + if ((assoc->max_cpu_mins_pj != (uint64_t)NO_VAL) + && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE) + && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "max cpu minutes limit %llu " + "with %llu for account %s", + job_ptr->job_id, assoc->id, + assoc->max_cpu_mins_pj, + job_cpu_usage_mins, assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + assoc = assoc->parent_assoc_ptr; + /* these limits don't apply to the root assoc */ + if(assoc == assoc_mgr_root_assoc) + break; + } + slurm_mutex_unlock(&assoc_mgr_association_lock); + + if(job_ptr->state_reason == FAIL_TIMEOUT) { + last_job_update = now; + _job_timed_out(job_ptr); + xfree(job_ptr->state_desc); + continue; + } + /* Give srun command warning message about pending timeout */ if (job_ptr->end_time <= 
(now + PERIODIC_TIMEOUT * 2)) srun_timeout (job_ptr); @@ -6325,31 +6441,10 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc, return false; } - if ((assoc_ptr->grp_wall != NO_VAL) && - (assoc_ptr->grp_wall != INFINITE)) { - time_limit = assoc_ptr->grp_wall; - if (job_desc->time_limit == NO_VAL) { - if (part_ptr->max_time == INFINITE) - job_desc->time_limit = time_limit; - else - job_desc->time_limit = - MIN(time_limit, - part_ptr->max_time); - timelimit_set = 1; - } else if (timelimit_set && - job_desc->time_limit > time_limit) { - job_desc->time_limit = time_limit; - } else if (job_desc->time_limit > time_limit) { - info("job submit for user %s(%u): " - "time limit %u exceeds group " - "time limit %u for account %s", - user_name, - job_desc->user_id, - job_desc->time_limit, time_limit, - assoc_ptr->acct); - return false; - } - } + + /* for validation we don't need to look at + * assoc_ptr->grp_wall. It is checked while the job is running. + */ /* We don't need to look at the regular limits for * parents since we have pre-propogated them, so just diff --git a/src/sreport/cluster_reports.c b/src/sreport/cluster_reports.c index 6a960a0e6f49283fdfb2f3bb19c919e63aacb752..3d331421ef1a474c770a19f5044111f0cea3b7ac 100644 --- a/src/sreport/cluster_reports.c +++ b/src/sreport/cluster_reports.c @@ -392,7 +392,7 @@ static int _setup_print_fields_list(List format_list) field->type = PRINT_CLUSTER_ACCT; field->name = xstrdup("Account"); if(tree_display) - field->len = 20; + field->len = -20; else field->len = 15; field->print_routine = print_fields_str; diff --git a/src/sshare/process.c b/src/sshare/process.c index 2e10e07608f9c30fd5ec0d904e67c4b1c03596e6..c32c4c5699571e0d8fd6536a1cc076c2af131332 100644 --- a/src/sshare/process.c +++ b/src/sshare/process.c @@ -94,7 +94,7 @@ extern int process(shares_response_msg_t *resp) if(!strncasecmp("Account", object, 1)) { field->type = PRINT_ACCOUNT; field->name = xstrdup("Account"); - field->len = 20; + field->len = -20; field->print_routine = print_fields_str; } else if(!strncasecmp("Cluster", object, 1)) { field->type = PRINT_CLUSTER;
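A note on the field-width changes above: several hunks flip the Account column's field->len from 20 to -20 (sacctmgr, sreport, sshare) while print_fields_header() now always prints the header with the absolute width. Below is a minimal C sketch of the printf behaviour this relies on; the helper name and widths are made up for illustration, and the assumption is that the value-printing routines pass the signed length through to printf, so a negative length left-justifies the tree-indented account names while the header column width stays fixed.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: header padded to the absolute width, value padded
 * with the signed width, so len = -20 left-justifies the value. */
static void print_cell(int len, const char *header, const char *value)
{
	int abs_len = abs(len);

	printf("%*.*s|", abs_len, abs_len, header);	/* header: fixed width */
	printf("%*.*s|\n", len, abs_len, value);	/* value: signed width */
}

int main(void)
{
	print_cell(20, "Account", " physics");	/* value right-justified */
	print_cell(-20, "Account", " physics");	/* value left-justified */
	return 0;
}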