diff --git a/NEWS b/NEWS index fc57afd24d6fc3c038742e9562f7c47340ba0aa1..5bd841b75fea17a094d3b543ad765ca2f5da9fff 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,7 @@ documents those changes that are of interest to users and administrators. -- Double Munge connect retry timeout from 1 to 2 seconds. -- sview - Remove unneeded code that was resolved globally in commit 98e24b0dedc. + -- Collect and report the accounting of the batch step and its children. * Changes in Slurm 14.11.0rc2 ============================= diff --git a/src/common/slurm_jobacct_gather.c b/src/common/slurm_jobacct_gather.c index 9b1226af355943ba73bf4d889989be04fa2f8c58..88791100214fed18c11440bedaae7a2066a418ea 100644 --- a/src/common/slurm_jobacct_gather.c +++ b/src/common/slurm_jobacct_gather.c @@ -723,9 +723,11 @@ extern int jobacctinfo_setinfo(jobacctinfo_t *jobacct, break; case JOBACCT_DATA_RUSAGE: - jobacct->user_cpu_sec = rusage->ru_utime.tv_sec; + if (rusage->ru_utime.tv_sec > jobacct->user_cpu_sec) + jobacct->user_cpu_sec = rusage->ru_utime.tv_sec; jobacct->user_cpu_usec = rusage->ru_utime.tv_usec; - jobacct->sys_cpu_sec = rusage->ru_stime.tv_sec; + if (rusage->ru_stime.tv_sec > jobacct->sys_cpu_sec) + jobacct->sys_cpu_sec = rusage->ru_stime.tv_sec; jobacct->sys_cpu_usec = rusage->ru_stime.tv_usec; break; case JOBACCT_DATA_MAX_RSS: diff --git a/src/plugins/jobacct_gather/common/common_jag.c b/src/plugins/jobacct_gather/common/common_jag.c index 7102bcbbfcb0fdeab70e5de084d4a98d8474c3f8..84b6775c7c2f02ef528fb3dfc29fb95e37e60136 100644 --- a/src/plugins/jobacct_gather/common/common_jag.c +++ b/src/plugins/jobacct_gather/common/common_jag.c @@ -269,7 +269,7 @@ static int _get_process_data_line(int in, jag_prec_t *prec) { * RETVAL: ==0 - no valid data * !=0 - data are valid * - * The *prec will mostly be filled in. We need to simply subtract the + * The *prec will mostly be filled in. We need to simply subtract the * amount of shared memory used by the process (in KB) from *prec->rss * and return the updated struct. * @@ -618,8 +618,7 @@ extern void jag_common_poll_data( itr2 = list_iterator_create(prec_list); while ((prec = list_next(itr2))) { if (prec->pid == jobacct->pid) { - uint32_t cpu_calc = - (prec->ssec + prec->usec)/hertz; + uint32_t cpu_calc; #if _DEBUG info("pid:%u ppid:%u rss:%d KB", prec->pid, prec->ppid, prec->rss); @@ -628,7 +627,7 @@ extern void jag_common_poll_data( if (callbacks->get_offspring_data) (*(callbacks->get_offspring_data)) (prec_list, prec, prec->pid); - + cpu_calc = (prec->ssec + prec->usec)/hertz; /* tally their usage */ jobacct->max_rss = MAX(jobacct->max_rss, prec->rss); @@ -652,12 +651,17 @@ extern void jag_common_poll_data( jobacct->min_cpu = MAX(jobacct->min_cpu, cpu_calc); jobacct->last_total_cputime = jobacct->tot_cpu; + /* Update the cpu times + */ jobacct->tot_cpu = cpu_calc; - debug2("%d mem size %"PRIu64" %"PRIu64" " - "time %u(%u+%u)", + jobacct->user_cpu_sec = prec->usec/hertz; + jobacct->sys_cpu_sec = prec->ssec/hertz; + debug2("%s: %d mem size %"PRIu64" %"PRIu64" " + "time %u(%u+%u)", __func__, jobacct->pid, jobacct->max_rss, jobacct->max_vsize, jobacct->tot_cpu, - prec->usec, prec->ssec); + jobacct->user_cpu_sec, + jobacct->sys_cpu_sec); /* compute frequency */ jobacct->this_sampled_cputime = cpu_calc - jobacct->last_total_cputime; @@ -666,13 +670,14 @@ extern void jag_common_poll_data( "cpuinfo_cur_freq", sbuf); jobacct->act_cpufreq = _update_weighted_freq(jobacct, sbuf); - debug2("Task average frequency = %u " + debug2("%s: Task average frequency = %u " "pid %d mem size %"PRIu64" %"PRIu64" " - "time %u(%u+%u)", + "time %u(%u+%u)", __func__, jobacct->act_cpufreq, jobacct->pid, jobacct->max_rss, jobacct->max_vsize, jobacct->tot_cpu, - prec->usec, prec->ssec); + jobacct->user_cpu_sec, + jobacct->sys_cpu_sec); /* get energy consumption * only once is enough since we * report per node energy consumption */