From ecf681642b108bae39f797eb4b44e2df357a7b08 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Fri, 14 Apr 2006 18:44:34 +0000 Subject: [PATCH] finished documentation for jobacct plugin --- NEWS | 8 +- doc/html/big_sys.shtml | 4 +- doc/html/configurator.html | 8 +- doc/html/jobacctplugins.shtml | 273 ++++++++++++++++-- doc/html/quickstart_admin.shtml | 2 + doc/man/man5/slurm.conf.5 | 33 +-- etc/slurm.conf.example | 7 +- src/common/slurm_jobacct.h | 2 +- .../jobacct/common/common_slurmstepd.c | 2 +- src/plugins/jobacct/common/jobacct_common.c | 19 +- src/sacct/print.c | 77 +++-- 11 files changed, 318 insertions(+), 117 deletions(-) diff --git a/NEWS b/NEWS index 57f428c3a71..72a3cb81510 100644 --- a/NEWS +++ b/NEWS @@ -4,15 +4,17 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.1.0-pre5 ============================= -- Added step completion RPC logic - -- Vastly changed sacct and the jobacct plugin. Needs more documentation - still. + -- Vastly changed sacct and the jobacct plugin. Read documentation for full + details. -- Added jobacct plugin for AIX and BlueGene, they currently don't work, - but infastructure is in place. + but infrastructure is in place. -- Add support for srun option --ctrl-comm-ifhn to set PMI communications address (Hongjia Cao, National University of Defense Technology). -- Moved safe_read/write to slurm_protocol_defs.h removing multiple copies. -- Remove vestigial functions slurm_allocate_resources_and_run() and slurm_free_resource_allocation_and_run_response_msg(). + -- moved the way forward logic waited for fanout logic mostly eliminating + problems with scalability issues. 
* Changes in SLURM 1.1.0-pre4 ============================= diff --git a/doc/html/big_sys.shtml b/doc/html/big_sys.shtml index ef9949cbea1..0aa2e6e0673 100644 --- a/doc/html/big_sys.shtml +++ b/doc/html/big_sys.shtml @@ -21,7 +21,7 @@ is best avoided.</p> <h2>Job Accounting Plugin (JobAcctType)</h2> -<p>Job accounting relies upon the <i>slurmd</i> daemon on each compute +<p>Job accounting relies upon the <i>slurmstepd</i> daemon on each compute node periodically sampling data. This data collection will take compute cycles away from the application inducing what is known as <i>system noise</i>. @@ -32,7 +32,7 @@ is best (<i>jobacct/none</i>). Consider use of job completion records (<i>JobCompType</i>) for accounting purposes as this entails far less overhead. If job accounting is required, configure the sampling interval -to a relatively large size (e.g. <i>JobAcctParameters="Frequency=300"</i>). +to a relatively large size (e.g. <i>JobAcctFrequency=300</i>). Some experimentation may also be required to deal with collisions on data transmission.</p> diff --git a/doc/html/configurator.html b/doc/html/configurator.html index 729bff00e1e..55f8430e395 100644 --- a/doc/html/configurator.html +++ b/doc/html/configurator.html @@ -117,7 +117,7 @@ function displayfile() get_field("JobCompLoc",document.config.job_comp_loc) + "<br>" + "JobAcctType=jobacct/" + get_radio_value(document.config.job_acct_type) + "<br>" + get_field("JobAcctLogfile",document.config.job_acct_logfile) + "<br>" + - "#JobAcctParameters= <br>"+ + get_field("JobAcctFrequency",document.config.job_acct_frequency) + "<br>" + "# <br>" + "# COMPUTE NODES <br>" + "NodeName=" + document.config.node_name.value + @@ -222,7 +222,7 @@ to start up (and may eventually time-out) because the NIS server(s) may not be able to quickly respond to simultaneous requests from multiple slurmd's. You can instruct slurmd to cache /etc/groups entries to prevent this from happening by setting -<B>CacheGroups</B>=1". 
Reconfiguring ("scontrol reconfig") with +<B>CacheGroups</B>=1. Reconfiguring ("scontrol reconfig") with <B>CacheGroups</B>=0 will cause slurmd to purge the cache. Select one value for <B>CacheGroups</B>:<BR> <input type="radio" name="cache_groups" value="0" checked> @@ -392,9 +392,11 @@ SLURM accounts for resource use per job. System specifics can be polled determined by system type<BR> Select one value for <B>JobAcctType</B>:<BR> <input type="radio" name="job_acct_type" value="none" checked> <B>None</B>: No -extra information gathered<BR> +job accounting<BR> <input type="radio" name="job_acct_type" value="linux"> <B>Linux</B>: Specifc Linux proc table information gathered, use with Linux systems only<BR> +<input type="text" name="job_acct_frequency" value=""> <B>JobAcctFrequency</B>: +polling interval in seconds.<BR> <input type="text" name="job_acct_logfile" value=""> <B>JobAcctLogFile</B>: Location specification. This is the location of the text file to be written to (used by Log only).Use a fully diff --git a/doc/html/jobacctplugins.shtml b/doc/html/jobacctplugins.shtml index f76ae517cb3..8aa51658cbc 100644 --- a/doc/html/jobacctplugins.shtml +++ b/doc/html/jobacctplugins.shtml @@ -5,7 +5,7 @@ <h2> Overview</h2> <p> This document describes SLURM job accounting plugins and the API that defines them. It is intended as a resource to programmers wishing to write -their own SLURM job accounting plugins. This is version 0 of the API. +their own SLURM job accounting plugins. This is version 1 of the API. <p>SLURM job accounting plugins must conform to the @@ -24,62 +24,288 @@ The minor type can be any suitable name for the type of accounting package. We currently use <ul> <li><b>linux</b>—Gathers information from linux proctable and addes this -information to the standard rusage information already gathered for each -job. -<li><b>none</b>—No extra information gathered. +information to the standard rusage information also gathered for each job. 
+<li><b>none</b>—No information gathered. </ul> The <b>sacct</b> program can be used to display gathered data from regular accounting and from these plugins. <p>The programmer is urged to study -<span class="commandline">src/plugins/jobacct/linux</span> +<span class="commandline">src/plugins/jobacct/linux</span> and +<span class="commandline">src/plugins/jobacct/common</span> for a sample implementation of a SLURM job accounting plugin. <p class="footer"><a href="#top">top</a> <h2>API Functions</h2> -The job accounting API uses hooks in the slurmstepd. +The job accounting API uses hooks in the slurmctld, slurmd, and slurmstepd. <p>All of the following functions are required. Functions which are not implemented must be stubbed. - - <h4>Functions called by all slurmstepd processes</h4> -<p class="commandline">int jobacct_p_init(int frequency) +<p class="commandline">int jobacct_p_startpoll(int frequency) <p style="margin-left:.2in"><b>Description</b>: -jobacct_p_init() is called at the beginning of each process at the start of the -slurmstepd, this starts a thread that should poll information to be gathered -at the end of the process. Put global initialization here. +jobacct_p_startpoll() is called at the start of the slurmstepd, +this starts a thread that should poll information to be queried at any time +until the end of the process. +Put global initialization here. <p style="margin-left:.2in"><b>Arguments</b>: -<span class="commandline">frequency</span> (input) poll frequency for gathering +<span class="commandline">frequency</span> (input) poll frequency for polling thread. <p style="margin-left:.2in"><b>Returns</b>: <span class="commandline">SLURM_SUCCESS</span> on success, or <span class="commandline">SLURM_FAILURE</span> on failure. 
-<p class="commandline">int jobacct_p_fini(slurmd_job_t *job) +<p class="commandline">int jobacct_p_endpoll() <p style="margin-left:.2in"><b>Description</b>: -jobacct_p_fini() is called when the process is finished gathered information is -placed in the slurmd_job_t structure. This function will also stop the -gathering thread. +jobacct_p_endpoll() is called when the process is finished to stop the +polling thread. <p style="margin-left:.2in"><b>Arguments</b>: -<span class="commandline">job</span> (input) structure to hold information -gathered from gathering thread. +<span class="commandline">none</span> <p style="margin-left:.2in"><b>Returns</b>: <span class="commandline">SLURM_SUCCESS</span> on success, or <span class="commandline">SLURM_FAILURE</span> on failure. -<p class="commandline">int jobacct_p_suspend() +<p class="commandline">void jobacct_p_suspendpoll() <p style="margin-left:.2in"><b>Description</b>: -jobacct_p_suspend() is called when the process is suspended by the slurmctld. +jobacct_p_suspendpoll() is called when the process is suspended or resumed. This causes the polling thread to halt until the process is resumed. <p style="margin-left:.2in"><b>Arguments</b>: <span class="commandline">none</span> <p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">none</span> + +<p class="commandline">int jobacct_p_add_task(pid_t pid, uint16_t tid) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_add_task() used to add a task to the poller. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline"> pid</span> (input) Process id +<span class="commandline"> tid</span> (input) slurm global task id +<p style="margin-left:.2in"><b>Returns</b>: <span class="commandline">SLURM_SUCCESS</span> on success, or <span class="commandline">SLURM_FAILURE</span> on failure. 
+ +<p class="commandline">jobacctinfo_t *jobacct_p_stat_task(pid_t pid) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_stat_task() used to get most recent information about task. +DO NOT FREE the information returned by this function! +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline"> pid</span> (input) Process id +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">jobacctinfo structure pointer</span> on success, or +<span class="commandline">NULL</span> on failure. + +<p class="commandline">int jobacct_p_remove_task(pid_t pid) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_remove_task() used to remove a task from the poller. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline"> pid</span> (input) Process id +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="footer"><a href="#top">top</a> + +<h4>Functions called by the slurmctld process</h4> + +<p class="commandline">int jobacct_p_init_slurmctld(char *job_acct_log) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_init_slurmctld() is called at the start of the slurmctld, +this opens the logfile to be written to. +Put global initialization here. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">job_acct_log</span> (input) logfile name. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline">int jobacct_p_fini_slurmctld() +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_fini_slurmctld() is called at the end of the slurmctld, +this closes the logfile. 
+<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">none</span> +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline"> +int jobacct_p_job_start_slurmctld(struct job_record *job_ptr) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_job_start_slurmctld() is called at the allocation of a new job in +the slurmctld, this prints out beginning information about a job. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">job_ptr</span> (input) information about the job in +slurmctld. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline"> +int jobacct_p_job_complete_slurmctld(struct job_record *job_ptr) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_job_complete_slurmctld() is called at the end of a job in +the slurmctld, this prints out ending information about a job. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">job_ptr</span> (input) information about the job in +slurmctld. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline"> +int jobacct_p_step_start_slurmctld(struct step_record *step_ptr) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_step_start_slurmctld() is called at the allocation of a new step in +the slurmctld, this prints out beginning information about a step. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">step_ptr</span> (input) information about the step in +slurmctld. 
+<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline"> +int jobacct_p_step_complete_slurmctld(struct step_record *step_ptr) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_step_complete_slurmctld() is called at the end of a step in +the slurmctld, this prints out ending information about a step. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">step_ptr</span> (input) information about the step in +slurmctld. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline"> +int jobacct_p_suspend_slurmctld(struct job_record *job_ptr) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_suspend_slurmctld() is called when a job is suspended or resumed in +the slurmctld, this prints out information about the suspension of the job +to the logfile. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">job_ptr</span> (input) information about the job in +slurmctld. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="footer"><a href="#top">top</a> + +<h4>Functions common to all processes</h4> + +<p class="commandline"> +int jobacct_p_init_struct(jobacctinfo_t *jobacct, uint16_t tid) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_init_struct() is called to set the values of a jobacctinfo_t to +initial values. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">jobacct</span> +(input/output) structure to be altered. +<span class="commandline">tid</span> +(input) id of the task; send in (uint16_t)NO_VAL if no specific task. 
+<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline">jobacctinfo_t *jobacct_p_alloc(uint16_t tid) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_alloc() used to alloc a pointer to and initialize a +new jobacctinfo structure.<br> +You will need to free the information returned by this function! +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">tid</span> +(input) id of the task; send in (uint16_t)NO_VAL if no specific task. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">jobacctinfo structure pointer</span> on success, or +<span class="commandline">NULL</span> on failure. + +<p class="commandline">void jobacct_p_free(jobacctinfo_t *jobacct) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_free() used to free the allocation made by jobacct_p_alloc(). +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">jobacct</span> +(input) structure to be freed. +<span class="commandline">none</span> +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">none</span> + +<p class="commandline"> +int jobacct_p_setinfo(jobacctinfo_t *jobacct, + enum jobacct_data_type type, void *data) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_setinfo() is called to set the values of a jobacctinfo_t to +specific values based on inputs. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">jobacct</span> +(input/output) structure to be altered. +<span class="commandline">type</span> +(input) enum of specific part of jobacct to alter. +<span class="commandline">data</span> +(input) corresponding data to set jobacct part to. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. 
+ +<p class="commandline"> +int jobacct_p_getinfo(jobacctinfo_t *jobacct, + enum jobacct_data_type type, void *data) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_getinfo() is called to get specific values from a jobacctinfo_t +based on inputs. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">jobacct</span> +(input) structure to be queried. +<span class="commandline">type</span> +(input) enum of specific part of jobacct to get. +<span class="commandline">data</span> +(output) corresponding data from jobacct part. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + +<p class="commandline"> +void jobacct_p_aggregate(jobacctinfo_t *dest, jobacctinfo_t *from) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_aggregate() is called to aggregate and get max values from two +different jobacctinfo structures. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">dest</span> +(input/output) initial structure to be applied to. +<span class="commandline">from</span> +(input) new info to apply to dest. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">none</span> + +<p class="commandline"> +void jobacct_p_pack(jobacctinfo_t *jobacct, Buf buffer) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_pack() pack jobacctinfo_t in a buffer to send across the network. +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">jobacct</span> +(input) structure to pack. +<span class="commandline">buffer</span> +(input/output) buffer to pack structure into. 
+<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">none</span> + +<p class="commandline"> +void jobacct_p_unpack(jobacctinfo_t *jobacct, Buf buffer) +<p style="margin-left:.2in"><b>Description</b>: +jobacct_p_unpack() unpack jobacctinfo_t from a buffer received from +the network. +You will need to free the jobacctinfo_t returned by this function! +<p style="margin-left:.2in"><b>Arguments</b>: +<span class="commandline">jobacct</span> +(input/output) structure to fill. +<span class="commandline">buffer</span> +(input) buffer to unpack structure from. +<p style="margin-left:.2in"><b>Returns</b>: +<span class="commandline">SLURM_SUCCESS</span> on success, or +<span class="commandline">SLURM_FAILURE</span> on failure. + <p class="footer"><a href="#top">top</a> <h2>Parameters</h2> @@ -90,11 +316,10 @@ plugins, the job accounting API counts on three parameters: <dd>Specifies which plugin should be used. <dt><span class="commandline">JobAcctFrequency</span> <dd>Let the plugin know how long between pollings. +<dt><span class="commandline">JobAcctLogFile</span> +<dd>Let the plugin know the name of the logfile to use. </dl> -<p class="footer"><a href="#top">top</a> - - <h2>Versioning</h2> <p> This document describes version 1 of the SLURM Job Accounting API. Future releases of SLURM may revise this API. 
A job accounting plugin conveys its diff --git a/doc/html/quickstart_admin.shtml b/doc/html/quickstart_admin.shtml index 570f3b75d09..bee2b436f69 100644 --- a/doc/html/quickstart_admin.shtml +++ b/doc/html/quickstart_admin.shtml @@ -533,6 +533,8 @@ SlurmdSpoolDir = /tmp/slurmd SlurmdTimeout = 300 TreeWidth = 50 JobAcctLogFile = /tmp/jobacct.log +JobAcctFrequency = 5 +JobAcctType = jobacct/linux SLURM_CONFIG_FILE = /etc/slurm/slurm.conf StateSaveLocation = /usr/local/tmp/slurm/adev SwitchType = switch/elan diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 0b1430c8404..cc03977227a 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -141,38 +141,17 @@ May not exceed 65533. Define the job accounting mechanism type. Acceptable values at present include "jobacct/linux" and "jobacct/none". The default value is "jobacct/none". -In order to use the \fBsacct\fR tool, "jobacct/linux" must be configured.a -\fBJobAcctLoc\fR +In order to use the \fBsacct\fR tool, "jobacct/linux" must be configured. +.TP +\fBJobAcctLogFile\fR Define the location where job accounting logs are to be written. For jobacct/none this parameter is ignored. For jobacct/linux this is the fully-qualified file name for the data file. .TP -\fBJobAcctParameters\fR -Define any parameters to pass to the job accounting plugin, in a -quoted and comma-separated list. +\fBJobAcctFrequency\fR +Define the polling frequency to pass to the job accounting plugin. For jobacct/none this parameter is ignored. -For jobacct/linux the parameters available are -.RS -.TP -\fBFrequency=N\fR, where N is the number of seconds between -sampling the memory usage stats (psize, vsize). If N=0, no sampling -is done and psize and vsize are returned as 0. -.TP -\fBMaxSendRetries=N\fR, where N is the number of times to try to send -an accounting message before giving up. 
-.TP -\fBMaxSendRetryDelay=N\fR, pause for 1 to MaxSendRetryDelay -seconds between attempts to deliver an -accounting message. -.TP -\fBStaggerSlotSize=N\fR - For a process that might be sending -a message at the same time as N other processes in the job, where N is -10 or greater, each process will pause a bit before trying to send its -message. For N tasks, N "staggered timeslots" are defined, in increments -of (StaggerSlotSize*.001) seconds. The first process sends its message -immediately, the second process pauses one increment before sending, -the third process pauses two increments before sending, and so on. -.RE +For jobacct/linux the parameter is a number in seconds between polls. .TP \fBJobCompLoc\fR The interpretation of this value depends upon the logging mechanism diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example index 5643a02dd3c..0488cfa5d43 100644 --- a/etc/slurm.conf.example +++ b/etc/slurm.conf.example @@ -270,9 +270,9 @@ # # o Define the job accounting mechanism to use. # -# "jobacct/linux" : Instigates a polling thread to gather information +# "jobacct/linux" : Job accounting information # from the linux proc table -# "jobacct/none" : Do not have polling thread. +# "jobacct/none" : No job accounting information. # JobAcctType=jobacct/none @@ -285,9 +285,8 @@ JobAcctType=jobacct/none # # o Define the log file for job accounting this will be written on the # same node the slurmctld is being ran on. -# This log is kept even if JobAcctType=jobacct/none # -JobAcctLogFile=/var/log/slurm_accounting.log +#JobAcctLogFile=/var/log/slurm_jobacct.log # # o Define the places to look for SLURM plugins. 
This is a diff --git a/src/common/slurm_jobacct.h b/src/common/slurm_jobacct.h index f9fb83f7183..a04c6986c3e 100644 --- a/src/common/slurm_jobacct.h +++ b/src/common/slurm_jobacct.h @@ -55,7 +55,7 @@ typedef struct slurm_jobacct_context * slurm_jobacct_context_t; -/* common for both slurmctld and slurmstepd */ +/* common */ extern int jobacct_g_init_struct(jobacctinfo_t *jobacct, uint16_t tid); extern jobacctinfo_t *jobacct_g_alloc(uint16_t tid); extern void jobacct_g_free(jobacctinfo_t *jobacct); diff --git a/src/plugins/jobacct/common/common_slurmstepd.c b/src/plugins/jobacct/common/common_slurmstepd.c index 9d4655dc72b..62184d75392 100644 --- a/src/plugins/jobacct/common/common_slurmstepd.c +++ b/src/plugins/jobacct/common/common_slurmstepd.c @@ -37,7 +37,7 @@ pthread_mutex_t jobacct_lock = PTHREAD_MUTEX_INITIALIZER; extern int common_endpoll() { fini = true; - + return SLURM_SUCCESS; } extern int common_add_task(pid_t pid, uint16_t tid) diff --git a/src/plugins/jobacct/common/jobacct_common.c b/src/plugins/jobacct/common/jobacct_common.c index 29d79a37786..af562e86f52 100644 --- a/src/plugins/jobacct/common/jobacct_common.c +++ b/src/plugins/jobacct/common/jobacct_common.c @@ -239,7 +239,15 @@ extern void common_aggregate(struct jobacctinfo *dest, } dest->tot_pages += from->tot_pages; - + if((dest->min_cpu > from->min_cpu) + || (dest->min_cpu == (uint32_t)NO_VAL)) { + if(from->min_cpu == (uint32_t)NO_VAL) + from->min_cpu = 0; + dest->min_cpu = from->min_cpu; + dest->min_cpu_task = from->min_cpu_task; + } + dest->tot_cpu += from->tot_cpu; + if(dest->max_vsize_task == (uint16_t)NO_VAL) dest->max_vsize_task = from->max_vsize_task; @@ -266,15 +274,6 @@ extern void common_aggregate(struct jobacctinfo *dest, dest->rusage.ru_stime.tv_usec -= 1E6; } - if((dest->min_cpu > from->min_cpu) - || (dest->min_cpu == (uint32_t)NO_VAL)) { - if(from->min_cpu == (uint32_t)NO_VAL) - from->min_cpu = 0; - dest->min_cpu = from->min_cpu; - dest->min_cpu_task = from->min_cpu_task; - } 
- dest->tot_cpu += from->tot_cpu; - dest->rusage.ru_maxrss += from->rusage.ru_maxrss; dest->rusage.ru_ixrss += from->rusage.ru_ixrss; dest->rusage.ru_idrss += from->rusage.ru_idrss; diff --git a/src/sacct/print.c b/src/sacct/print.c index bda72562396..be806f36ffa 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -27,49 +27,40 @@ \*****************************************************************************/ #include "sacct.h" +#define FORMAT_STRING_SIZE 32 char *_decode_status(int status); char *_elapsed_time(long secs, long usecs); char *_elapsed_time(long secs, long usecs) { - int days, hours, minutes, seconds; - char daybuf[10], - hourbuf[4], - minbuf[4]; - static char outbuf[20]; /* this holds LOTS of time! */ - div_t res; - - daybuf[0] = 0; - hourbuf[0] = 0; - minbuf[0] = 0; - - res = div(usecs+5000, 1e6); /* round up the usecs, then */ - usecs /= 1e4; /* truncate to .00's */ - - res = div(secs+res.quot, 60*60*24); /* 1 day is 24 hours of 60 - minutes of 60 seconds */ - days = res.quot; - res = div(res.rem, 60*60); - hours = res.quot; - res = div(res.rem, 60); - minutes = res.quot; - seconds = res.rem; - if (days) { - snprintf(daybuf, sizeof(daybuf), "%d-", days); - snprintf(hourbuf, sizeof(hourbuf), "%02d:", hours); - } else if (hours) - snprintf(hourbuf, sizeof(hourbuf), "%2d:", hours); - if (days || hours) - snprintf(minbuf, sizeof(minbuf), "%02d:", minutes); - else if (minutes) - snprintf(minbuf, sizeof(minbuf), "%2d:", minutes); - if (days || hours || minutes) - snprintf(outbuf, sizeof(outbuf), "%s%s%s%02d.%02ld", - daybuf, hourbuf, minbuf, seconds, usecs); + static char str[FORMAT_STRING_SIZE]; + long days, hours, minutes, seconds; + + while (usecs >= 1E6) { + secs++; + usecs -= 1E6; + } + + seconds = secs % 60; + minutes = (secs / 60) % 60; + hours = (secs / 3600) % 24; + days = secs / 86400; + + if (days) + snprintf(str, FORMAT_STRING_SIZE, + "%ld-%2.2ld:%2.2ld:%2.2ld", + days, hours, minutes, seconds); + else if (hours) + snprintf(str, 
FORMAT_STRING_SIZE, + "%ld:%2.2ld:%2.2ld", + hours, minutes, seconds); else - snprintf(outbuf, sizeof(outbuf), "%2d.%02ld", seconds, usecs); - return(outbuf); + snprintf(str, FORMAT_STRING_SIZE, + "%ld:%2.2ld", + minutes, seconds); + + return str; } void print_fields(type_t type, void *object) @@ -701,10 +692,10 @@ void print_rss(type_t type, void *object) switch(type) { case HEADLINE: - printf("%8s", "Rss"); + printf("%22s", "MAX_RSS/Task - AVE"); break; case UNDERSCORE: - printf("%8s", "------"); + printf("%22s", "----------------------"); break; case JOB: printf("%8ld", job->rusage.ru_maxrss); @@ -887,16 +878,18 @@ void print_cputime(type_t type, void *object) switch(type) { case HEADLINE: - printf("%10s", "SystemTime"); + printf("%15s", "SystemTime"); break; case UNDERSCORE: - printf("%10s", "------"); + printf("%15s", "----------"); break; case JOB: - printf("%10.2f", job->sacct.min_cpu); + printf("%15s", + _elapsed_time((int)job->sacct.min_cpu, 0)); break; case JOBSTEP: - printf("%10.2f", step->sacct.min_cpu); + printf("%15s", + _elapsed_time((int)step->sacct.min_cpu, 0)); break; } } -- GitLab