From 137dffa846ddb7888c94a32b6750b25e206131c4 Mon Sep 17 00:00:00 2001 From: Yiannis Georgiou <yiannis.georgiou@bull.net> Date: Wed, 5 Dec 2012 12:19:48 -0800 Subject: [PATCH] Energy RAPL updates 1) Updated scontrol show node to report "n/s" (not supported) for energy/power values when acct_gather_energy/rapl is configured and nodes do not support RAPL. Previously, these values were reported as zero. The scontrol man page has also been updated to explain this. 2) Updated sstat / sacct to report no value for ConsumedEnergy when acct_gather_energy/rapl is configured and a job/step is allocated at least one node that does not support RAPL. Previously, in this case, sstat and sacct reported a partial value for ConsumedEnergy. 3) Updated the logging mechanism so that if acct_gather_energy/rapl is configured and a job/step is allocated any nodes that do not support RAPL, a single debug entry is made in the log file. Previously, multiple repeating error messages were sent to the console and the log file. --- doc/man/man1/scontrol.1 | 7 ++++++- src/api/node_info.c | 12 ++++++++---- src/common/slurm_jobacct_gather.c | 6 ++++++ .../rapl/acct_gather_energy_rapl.c | 14 +++++++++++++- src/sacct/process.c | 3 +++ src/sstat/print.c | 6 ++++-- src/sstat/process.c | 3 +++ 7 files changed, 43 insertions(+), 8 deletions(-) diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 68f187fb888..286daa80652 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1298,13 +1298,18 @@ energy accounting sample, in watts. .TP \fILowestJoules\fP The energy consumed by the node between the last time it was powered on and -the last time it was registered by the slurmd daemon, in joules. +the last time it was registered by slurmd, in joules. .TP \fIConsumedJoules\fP The energy consumed by the node between the last time it was registered by the slurmd daemon and the last node energy accounting sample, in joules. 
+.PP +If the reported value is "n/s" (not supported), the node does not support the +configured \fBAcctGatherEnergyType\fR plugin. If the reported value is zero, energy +accounting for nodes is disabled. + .SH "ENVIRONMENT VARIABLES" .PP Some \fBscontrol\fR options may diff --git a/src/api/node_info.c b/src/api/node_info.c index b0a112eb078..47082ff782d 100644 --- a/src/api/node_info.c +++ b/src/api/node_info.c @@ -293,11 +293,15 @@ slurm_sprint_node_table (node_info_t * node_ptr, xstrcat(out, "\n "); /****** power Line ******/ - snprintf(tmp_line, sizeof(tmp_line), "CurrentWatts=%u LowestJoules=%u " - "ConsumedJoules=%u", - node_ptr->energy->current_watts, node_ptr->energy->base_watts, + if (node_ptr->energy->current_watts == NO_VAL) + snprintf(tmp_line, sizeof(tmp_line), "CurrentWatts=n/s " + "LowestJoules=n/s ConsumedJoules=n/s"); + else + snprintf(tmp_line, sizeof(tmp_line), "CurrentWatts=%u " + "LowestJoules=%u ConsumedJoules=%u", + node_ptr->energy->current_watts, + node_ptr->energy->base_watts, node_ptr->energy->consumed_energy); - xstrcat(out, tmp_line); if (one_liner) diff --git a/src/common/slurm_jobacct_gather.c b/src/common/slurm_jobacct_gather.c index 5a334d897a3..ea6fd699837 100644 --- a/src/common/slurm_jobacct_gather.c +++ b/src/common/slurm_jobacct_gather.c @@ -1031,6 +1031,9 @@ extern void jobacctinfo_aggregate(jobacctinfo_t *dest, jobacctinfo_t *from) dest->sys_cpu_usec -= 1E6; } dest->act_cpufreq += from->act_cpufreq; + if (from->energy.consumed_energy == NO_VAL) + dest->energy.consumed_energy = NO_VAL; + else dest->energy.consumed_energy += from->energy.consumed_energy; } @@ -1056,5 +1059,8 @@ extern void jobacctinfo_2_stats(slurmdb_stats_t *stats, jobacctinfo_t *jobacct) stats->cpu_min_taskid = jobacct->min_cpu_id.taskid; stats->cpu_ave = (double)jobacct->tot_cpu; stats->act_cpufreq = (double)jobacct->act_cpufreq; + if (jobacct->energy.consumed_energy == NO_VAL) + stats->consumed_energy = NO_VAL; + else stats->consumed_energy = 
(double)jobacct->energy.consumed_energy; } diff --git a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c index 42e376d4d37..371a400932b 100644 --- a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c +++ b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c @@ -145,7 +145,7 @@ static uint64_t _read_msr(int fd, int which) "this can be common. Check your system " "if you think this is in error."); } else { - error("Check if your CPU has RAPL support for %s: %m", + debug("Check if your CPU has RAPL support for %s: %m", _msr_string(which)); } } @@ -253,6 +253,8 @@ extern int acct_gather_energy_p_update_node_energy(void) uint64_t result; double ret; + if (local_energy->current_watts == NO_VAL) + return rc; acct_gather_energy_shutdown = false; if (!acct_gather_energy_shutdown) { uint32_t node_current_energy; @@ -372,11 +374,18 @@ static void _get_joules_task(acct_gather_energy_t *energy) extern int init(void) { int i; + uint64_t result; + _hardware(); for (i = 0; i < nb_pkg; i++) pkg_fd[i] = _open_msr(pkg2cpu[i]); local_energy = acct_gather_energy_alloc(); + + result = _read_msr(pkg_fd[0], MSR_RAPL_POWER_UNIT); + if (result == 0) + local_energy->current_watts = NO_VAL; + debug_flags = slurm_get_debug_flags(); verbose("%s loaded", plugin_name); return SLURM_SUCCESS; @@ -404,6 +413,9 @@ extern int acct_gather_energy_p_get_data(enum acct_energy_type data_type, int rc = SLURM_SUCCESS; switch (data_type) { case ENERGY_DATA_JOULES_TASK: + if (local_energy->current_watts == NO_VAL) + energy->consumed_energy = NO_VAL; + else _get_joules_task(energy); break; case ENERGY_DATA_STRUCT: diff --git a/src/sacct/process.c b/src/sacct/process.c index 1e74544795a..50302e285d4 100644 --- a/src/sacct/process.c +++ b/src/sacct/process.c @@ -92,6 +92,9 @@ void aggregate_stats(slurmdb_stats_t *dest, slurmdb_stats_t *from) dest->cpu_min_taskid = from->cpu_min_taskid; } dest->cpu_ave += 
from->cpu_ave; + if ((from->consumed_energy == NO_VAL) || (dest->consumed_energy == NO_VAL)) + dest->consumed_energy = NO_VAL; + else dest->consumed_energy += from->consumed_energy; dest->act_cpufreq += from->act_cpufreq; } diff --git a/src/sstat/print.c b/src/sstat/print.c index 10377d5947f..92bcaa9076e 100644 --- a/src/sstat/print.c +++ b/src/sstat/print.c @@ -90,6 +90,7 @@ void print_fields(slurmdb_step_rec_t *step) while ((field = list_next(print_fields_itr))) { char *tmp_char = NULL; + memset(&outbuf, 0, sizeof(outbuf)); switch(field->type) { case PRINT_AVECPU: @@ -111,8 +112,9 @@ void print_fields(slurmdb_step_rec_t *step) (curr_inx == field_count)); break; case PRINT_CONSUMED_ENERGY: - - convert_num_unit((float)step->stats.consumed_energy, + if (!fuzzy_equal(step->stats.consumed_energy, NO_VAL)) + convert_num_unit((float) + step->stats.consumed_energy, outbuf, sizeof(outbuf), UNIT_NONE); diff --git a/src/sstat/process.c b/src/sstat/process.c index 9f11b0ddf92..d3232ab8ded 100644 --- a/src/sstat/process.c +++ b/src/sstat/process.c @@ -89,6 +89,9 @@ void aggregate_stats(slurmdb_stats_t *dest, slurmdb_stats_t *from) dest->cpu_min_taskid = from->cpu_min_taskid; } dest->cpu_ave += from->cpu_ave; + if ((from->consumed_energy == NO_VAL) || (dest->consumed_energy == NO_VAL)) + dest->consumed_energy = NO_VAL; + else dest->consumed_energy += from->consumed_energy; dest->act_cpufreq += from->act_cpufreq; } -- GitLab