From a7e03592e0e32faefb75562248f314cbb5379767 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Tue, 12 Apr 2016 10:20:32 -0700 Subject: [PATCH] power/cray - returned to operation power/cray - Fix bug introduced in 15.08.10 preventing operation in many cases. bug 2628 --- NEWS | 2 + src/plugins/power/common/power_common.c | 2 + src/plugins/power/cray/power_cray.c | 61 +++++++++++++++++++++++-- 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/NEWS b/NEWS index 2e1bac7290f..ce1d6a68869 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,8 @@ documents those changes that are of interest to users and administrators. -- burst_buffer/cray - Fix for script creating or deleting persistent buffer would fail "paths" operation and hold the job. -- power/cray - Prevent possible divide by zero. + -- power/cray - Fix bug introduced in 15.08.10 preventing operation in many + cases. * Changes in Slurm 15.08.10 =========================== diff --git a/src/plugins/power/common/power_common.c b/src/plugins/power/common/power_common.c index 923262d32a1..d1b60805c0f 100644 --- a/src/plugins/power/common/power_common.c +++ b/src/plugins/power/common/power_common.c @@ -240,6 +240,8 @@ extern char *power_run_script(char *script_name, char *script_path, script_argv[3], script_argv[4], script_argv[5], script_argv[6], script_argv[7]); } + if (data_in) + info("%s: %s", __func__, data_in); } if (script_path[0] != '/') { error("%s: %s is not fully qualified pathname (%s)", diff --git a/src/plugins/power/cray/power_cray.c b/src/plugins/power/cray/power_cray.c index 1afce377eaa..3b99527b2b4 100644 --- a/src/plugins/power/cray/power_cray.c +++ b/src/plugins/power/cray/power_cray.c @@ -136,6 +136,7 @@ static char *capmc_path = NULL; static uint32_t cap_watts = DEFAULT_CAP_WATTS; static uint32_t set_watts = 0; static uint64_t debug_flag = 0; +static char *full_nid_string = NULL; static uint32_t decrease_rate = DEFAULT_DECREASE_RATE; static uint32_t increase_rate =
DEFAULT_INCREASE_RATE; static uint32_t job_level = NO_VAL; @@ -153,6 +154,7 @@ static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER; /*********************** local functions *********************/ +static void _build_full_nid_string(void); static void _clear_node_caps(void); static void _get_capabilities(void); static void _get_caps(void); @@ -369,6 +371,7 @@ static void _load_config(void) } xfree(sched_params); + xfree(full_nid_string); if (debug_flag & DEBUG_FLAG_POWER) { char *level_str = ""; if (job_level == 0) @@ -658,6 +661,45 @@ static void _json_parse_capabilities(json_object *jobj, } } +static void _build_full_nid_string(void) +{ + /* Read nodes */ + slurmctld_lock_t read_node_lock = { + NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK }; + struct node_record *node_ptr; + hostset_t hs = NULL; + char *sep, *tmp_str; + int i, num_ent = 0; + + if (full_nid_string) + return; + + lock_slurmctld(read_node_lock); + for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count; + i++, node_ptr++) { + if (!hs) + hs = hostset_create(_node_name2nid(node_ptr->name)); + else + hostset_insert(hs, _node_name2nid(node_ptr->name)); + num_ent++; + } + unlock_slurmctld(read_node_lock); + if (!hs) { + error("%s: No nodes found", __func__); + return; + } + tmp_str = xmalloc(node_record_count * 6 + 2); + (void) hostset_ranged_string(hs, num_ent * 6, tmp_str); + hostset_destroy(hs); + if ((sep = strrchr(tmp_str, ']'))) + sep[0] = '\0'; + if (tmp_str[0] == '[') + full_nid_string = xstrdup(tmp_str + 1); + else + full_nid_string = xstrdup(tmp_str); + xfree(tmp_str); +} + static void _get_caps(void) { /* Write nodes */ @@ -713,7 +755,7 @@ static void _get_caps(void) for (i = 0; i < num_ent; i++) { node_ptr = find_node_record2(ents[i].node_name[0]); if (!node_ptr) { - debug("%s: Node %s not in Slurm config", + debug2("%s: Node %s not in Slurm config", __func__, ents[i].node_name[0]); } else { if (!node_ptr->power) { @@ 
-722,7 +764,7 @@ static void _get_caps(void) } node_ptr->power->cap_watts = ents[i].cap_watts; } - xfree(ents[i].node_name[0]); + xfree(ents[i].node_name[0]); /* FUTURE: array of node names */ xfree(ents[i].node_name); } xfree(ents); @@ -1020,20 +1062,28 @@ static void _get_node_energy_counter(void) struct node_record *node_ptr; DEF_TIMERS; + _build_full_nid_string(); + if (!full_nid_string) + return; + script_argv[0] = capmc_path; script_argv[1] = "get_node_energy_counter"; - script_argv[2] = NULL; + script_argv[2] = "--nids"; + script_argv[3] = full_nid_string; + script_argv[4] = NULL; START_TIMER; cmd_resp = power_run_script("capmc", capmc_path, script_argv, get_timeout, NULL, &status); END_TIMER; if (status != 0) { - error("%s: capmc %s: %s", __func__, script_argv[1], cmd_resp); + error("%s: capmc %s %s %s: %s", __func__, + script_argv[1], script_argv[2], script_argv[3], cmd_resp); xfree(cmd_resp); return; } else if (debug_flag & DEBUG_FLAG_POWER) { - info("%s: capmc %s %s", __func__, script_argv[1], TIME_STR); + info("%s: capmc %s %s %s %s", __func__, + script_argv[1], script_argv[2], script_argv[3], TIME_STR); } if ((cmd_resp == NULL) || (cmd_resp[0] == '\0')) { xfree(cmd_resp); @@ -1698,6 +1748,7 @@ extern void fini(void) pthread_join(power_thread, NULL); power_thread = 0; xfree(capmc_path); + xfree(full_nid_string); } pthread_mutex_unlock(&thread_flag_mutex); } -- GitLab