diff --git a/NEWS b/NEWS index ac40f8b42a6384c0e87e77c582dc7925dfd9d538..120515874c9f95f2b7099aa8dce32569f49e0a24 100644 --- a/NEWS +++ b/NEWS @@ -48,6 +48,8 @@ documents those changes that are of interest to users and admins. -- Corrected the sh5util program to print the header in the csv file only once, set the debug messages at debug() level, make the argument check case insensitive and avoid printing duplicate \n. + -- If energy values cannot be collected, send a message to the controller + to drain the node and log an error in the slurmd log file. * Changes in Slurm 2.6.0 ======================== diff --git a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c index f242b446ef1c16f94220ff753ad242dff1a9378a..a7c91e509179287900623eb4e5a8aacf2e286a23 100644 --- a/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c +++ b/src/plugins/acct_gather_energy/rapl/acct_gather_energy_rapl.c @@ -137,7 +137,7 @@ static uint32_t debug_flags = 0; /* one cpu in the package */ static int pkg2cpu[MAX_PKGS] = {[0 ... MAX_PKGS-1] -1}; static int pkg_fd[MAX_PKGS] = {[0 ... 
MAX_PKGS-1] -1}; - +static char hostname[MAXHOSTNAMELEN]; static int nb_pkg = 0; @@ -280,6 +280,31 @@ static bool _run_in_daemon(void) return run; } +/* _send_drain_request() + */ +static void +_send_drain_request(void) +{ + update_node_msg_t node_msg; + static char drain_request_sent; + + if (drain_request_sent) + return; + + slurm_init_update_node_msg(&node_msg); + node_msg.node_names = hostname; + node_msg.reason = "Cannot collect energy data."; + node_msg.node_state = NODE_STATE_DRAIN; + + drain_request_sent = 1; + debug("%s: sending NODE_STATE_DRAIN to controller", __func__); + + if (slurm_update_node(&node_msg) != SLURM_SUCCESS) { + error("%s: Unable to drain node %s: %m", __func__, hostname); + drain_request_sent = 0; + } +} + static void _get_joules_task(acct_gather_energy_t *energy) { int i; @@ -287,7 +312,12 @@ static void _get_joules_task(acct_gather_energy_t *energy) uint64_t result; double ret; - xassert(pkg_fd[0] != -1); + if (pkg_fd[0] < 0) { + error("%s: device /dev/cpu/#msr not opened " + "energy data cannot be collected.", __func__); + _send_drain_request(); + return; + } /* MSR_RAPL_POWER_UNIT * Power Units - bits 3:0 @@ -406,6 +436,8 @@ extern int init(void) { debug_flags = slurm_get_debug_flags(); + gethostname(hostname, MAXHOSTNAMELEN); + /* put anything that requires the .conf being read in acct_gather_energy_p_conf_parse */ diff --git a/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c b/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c index 708cd6064eef9647707e4d3ca8cc4205b92c451c..df3c0bd94dea14bd3b196dab7869888dce410703 100644 --- a/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c +++ b/src/plugins/acct_gather_profile/hdf5/sh5util/sh5util.c @@ -1171,13 +1171,13 @@ int main (int argc, char **argv) profile_init(); switch (params.mode) { case SH5UTIL_MODE_MERGE: - debug("Merging node-step files into %s", - params.output); + info("Merging node-step files into %s", + params.output); _merge_step_files(); break; case 
SH5UTIL_MODE_EXTRACT: - debug("Extracting job data from %s into %s", - params.input, params.output); + info("Extracting job data from %s into %s", + params.input, params.output); _extract_data(); break; default: