Skip to content
Snippets Groups Projects
Commit 137dffa8 authored by Yiannis Georgiou's avatar Yiannis Georgiou Committed by Morris Jette
Browse files

Energy RAPL updates

1) Updated scontrol show node to report "n/s" (not supported) for energy/power values when acct_gather_energy/rapl is configured and nodes do not support RAPL. Previously, these values were reported as zero. The scontrol man page has also been updated to explain this.

2) Updated sstat / sacct to report no value for ConsumedEnergy when acct_gather_energy/rapl is configured, a job/step is allocated, and at least one node does not support RAPL. Previously, in this case, sstat and sacct reported a partial value for ConsumedEnergy.

3) Updated the logging mechanism so that if acct_gather_energy/rapl is configured and a job/step is allocated any nodes that do not support RAPL, a single debug entry is made in the log file. Previously, multiple repeating error messages were sent to the console and the log file.
parent 7dfc6d87
No related branches found
No related tags found
No related merge requests found
......@@ -1298,13 +1298,18 @@ energy accounting sample, in watts.
.TP
\fILowestJoules\fP
The energy consumed by the node between the last time it was powered on and
the last time it was registered by the slurmd daemon, in joules.
the last time it was registered by slurmd, in joules.
.TP
\fIConsumedJoules\fP
The energy consumed by the node between the last time it was registered by
the slurmd daemon and the last node energy accounting sample, in joules.
.PP
If the reported value is "n/s" (not supported), the node does not support the
configured \fBAcctGatherEnergyType\fR plugin. If the reported value is zero, energy
accounting for nodes is disabled.
.SH "ENVIRONMENT VARIABLES"
.PP
Some \fBscontrol\fR options may
......
......@@ -293,11 +293,15 @@ slurm_sprint_node_table (node_info_t * node_ptr,
xstrcat(out, "\n ");
/****** power Line ******/
snprintf(tmp_line, sizeof(tmp_line), "CurrentWatts=%u LowestJoules=%u "
"ConsumedJoules=%u",
node_ptr->energy->current_watts, node_ptr->energy->base_watts,
if (node_ptr->energy->current_watts == NO_VAL)
snprintf(tmp_line, sizeof(tmp_line), "CurrentWatts=n/s "
"LowestJoules=n/s ConsumedJoules=n/s");
else
snprintf(tmp_line, sizeof(tmp_line), "CurrentWatts=%u "
"LowestJoules=%u ConsumedJoules=%u",
node_ptr->energy->current_watts,
node_ptr->energy->base_watts,
node_ptr->energy->consumed_energy);
xstrcat(out, tmp_line);
if (one_liner)
......
......@@ -1031,6 +1031,9 @@ extern void jobacctinfo_aggregate(jobacctinfo_t *dest, jobacctinfo_t *from)
dest->sys_cpu_usec -= 1E6;
}
dest->act_cpufreq += from->act_cpufreq;
if (from->energy.consumed_energy == NO_VAL)
dest->energy.consumed_energy = NO_VAL;
else
dest->energy.consumed_energy += from->energy.consumed_energy;
}
......@@ -1056,5 +1059,8 @@ extern void jobacctinfo_2_stats(slurmdb_stats_t *stats, jobacctinfo_t *jobacct)
stats->cpu_min_taskid = jobacct->min_cpu_id.taskid;
stats->cpu_ave = (double)jobacct->tot_cpu;
stats->act_cpufreq = (double)jobacct->act_cpufreq;
if (jobacct->energy.consumed_energy == NO_VAL)
stats->consumed_energy = NO_VAL;
else
stats->consumed_energy = (double)jobacct->energy.consumed_energy;
}
......@@ -145,7 +145,7 @@ static uint64_t _read_msr(int fd, int which)
"this can be common. Check your system "
"if you think this is in error.");
} else {
error("Check if your CPU has RAPL support for %s: %m",
debug("Check if your CPU has RAPL support for %s: %m",
_msr_string(which));
}
}
......@@ -253,6 +253,8 @@ extern int acct_gather_energy_p_update_node_energy(void)
uint64_t result;
double ret;
if (local_energy->current_watts == NO_VAL)
return rc;
acct_gather_energy_shutdown = false;
if (!acct_gather_energy_shutdown) {
uint32_t node_current_energy;
......@@ -372,11 +374,18 @@ static void _get_joules_task(acct_gather_energy_t *energy)
extern int init(void)
{
int i;
uint64_t result;
_hardware();
for (i = 0; i < nb_pkg; i++)
pkg_fd[i] = _open_msr(pkg2cpu[i]);
local_energy = acct_gather_energy_alloc();
result = _read_msr(pkg_fd[0], MSR_RAPL_POWER_UNIT);
if (result == 0)
local_energy->current_watts = NO_VAL;
debug_flags = slurm_get_debug_flags();
verbose("%s loaded", plugin_name);
return SLURM_SUCCESS;
......@@ -404,6 +413,9 @@ extern int acct_gather_energy_p_get_data(enum acct_energy_type data_type,
int rc = SLURM_SUCCESS;
switch (data_type) {
case ENERGY_DATA_JOULES_TASK:
if (local_energy->current_watts == NO_VAL)
energy->consumed_energy = NO_VAL;
else
_get_joules_task(energy);
break;
case ENERGY_DATA_STRUCT:
......
......@@ -92,6 +92,9 @@ void aggregate_stats(slurmdb_stats_t *dest, slurmdb_stats_t *from)
dest->cpu_min_taskid = from->cpu_min_taskid;
}
dest->cpu_ave += from->cpu_ave;
if ((from->consumed_energy == NO_VAL) || (dest->consumed_energy == NO_VAL))
dest->consumed_energy = NO_VAL;
else
dest->consumed_energy += from->consumed_energy;
dest->act_cpufreq += from->act_cpufreq;
}
......@@ -90,6 +90,7 @@ void print_fields(slurmdb_step_rec_t *step)
while ((field = list_next(print_fields_itr))) {
char *tmp_char = NULL;
memset(&outbuf, 0, sizeof(outbuf));
switch(field->type) {
case PRINT_AVECPU:
......@@ -111,8 +112,9 @@ void print_fields(slurmdb_step_rec_t *step)
(curr_inx == field_count));
break;
case PRINT_CONSUMED_ENERGY:
convert_num_unit((float)step->stats.consumed_energy,
if (!fuzzy_equal(step->stats.consumed_energy, NO_VAL))
convert_num_unit((float)
step->stats.consumed_energy,
outbuf, sizeof(outbuf),
UNIT_NONE);
......
......@@ -89,6 +89,9 @@ void aggregate_stats(slurmdb_stats_t *dest, slurmdb_stats_t *from)
dest->cpu_min_taskid = from->cpu_min_taskid;
}
dest->cpu_ave += from->cpu_ave;
if ((from->consumed_energy == NO_VAL) || (dest->consumed_energy == NO_VAL))
dest->consumed_energy = NO_VAL;
else
dest->consumed_energy += from->consumed_energy;
dest->act_cpufreq += from->act_cpufreq;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment