Skip to content
Snippets Groups Projects
Commit a7e03592 authored by Morris Jette's avatar Morris Jette
Browse files

power/cray - returned to operation

power/cray - Fix bug introduced in 15.08.10 preventing operation in many
    cases.
bug 2628
parent b6a8373c
No related branches found
No related tags found
No related merge requests found
...@@ -15,6 +15,8 @@ documents those changes that are of interest to users and administrators. ...@@ -15,6 +15,8 @@ documents those changes that are of interest to users and administrators.
-- burst_buffer/cray - Fix for script creating or deleting persistent buffer -- burst_buffer/cray - Fix for script creating or deleting persistent buffer
would fail "paths" operation and hold the job. would fail "paths" operation and hold the job.
-- power/cray - Prevent possible divide by zero. -- power/cray - Prevent possible divide by zero.
-- power/cray - Fix bug introduced in 15.08.10 preventing operation in many
cases.
* Changes in Slurm 15.08.10 * Changes in Slurm 15.08.10
=========================== ===========================
......
...@@ -240,6 +240,8 @@ extern char *power_run_script(char *script_name, char *script_path, ...@@ -240,6 +240,8 @@ extern char *power_run_script(char *script_name, char *script_path,
script_argv[3], script_argv[4], script_argv[5], script_argv[3], script_argv[4], script_argv[5],
script_argv[6], script_argv[7]); script_argv[6], script_argv[7]);
} }
if (data_in)
info("%s: %s", __func__, data_in);
} }
if (script_path[0] != '/') { if (script_path[0] != '/') {
error("%s: %s is not fully qualified pathname (%s)", error("%s: %s is not fully qualified pathname (%s)",
......
...@@ -136,6 +136,7 @@ static char *capmc_path = NULL; ...@@ -136,6 +136,7 @@ static char *capmc_path = NULL;
static uint32_t cap_watts = DEFAULT_CAP_WATTS; static uint32_t cap_watts = DEFAULT_CAP_WATTS;
static uint32_t set_watts = 0; static uint32_t set_watts = 0;
static uint64_t debug_flag = 0; static uint64_t debug_flag = 0;
static char *full_nid_string = NULL;
static uint32_t decrease_rate = DEFAULT_DECREASE_RATE; static uint32_t decrease_rate = DEFAULT_DECREASE_RATE;
static uint32_t increase_rate = DEFAULT_INCREASE_RATE; static uint32_t increase_rate = DEFAULT_INCREASE_RATE;
static uint32_t job_level = NO_VAL; static uint32_t job_level = NO_VAL;
...@@ -153,6 +154,7 @@ static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER; ...@@ -153,6 +154,7 @@ static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER; static pthread_cond_t term_cond = PTHREAD_COND_INITIALIZER;
/*********************** local functions *********************/ /*********************** local functions *********************/
static void _build_full_nid_string(void);
static void _clear_node_caps(void); static void _clear_node_caps(void);
static void _get_capabilities(void); static void _get_capabilities(void);
static void _get_caps(void); static void _get_caps(void);
...@@ -369,6 +371,7 @@ static void _load_config(void) ...@@ -369,6 +371,7 @@ static void _load_config(void)
} }
xfree(sched_params); xfree(sched_params);
xfree(full_nid_string);
if (debug_flag & DEBUG_FLAG_POWER) { if (debug_flag & DEBUG_FLAG_POWER) {
char *level_str = ""; char *level_str = "";
if (job_level == 0) if (job_level == 0)
...@@ -658,6 +661,45 @@ static void _json_parse_capabilities(json_object *jobj, ...@@ -658,6 +661,45 @@ static void _json_parse_capabilities(json_object *jobj,
} }
} }
/*
 * Build the list of NIDs for every node in the Slurm node table and cache it
 * in the file-scope variable full_nid_string.
 *
 * The list is produced as a hostset ranged string with any enclosing square
 * brackets removed. If full_nid_string is already set, this function returns
 * immediately; the cached value is reused until it is xfree()'d elsewhere.
 *
 * On failure (no nodes found, or the ranged string does not fit in the
 * buffer) an error is logged and full_nid_string is left NULL, so callers
 * must test it before use.
 */
static void _build_full_nid_string(void)
{
	/* Read nodes */
	slurmctld_lock_t read_node_lock = {
		NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK };
	struct node_record *node_ptr;
	hostset_t hs = NULL;
	char *sep, *tmp_str;
	int i, num_ent = 0, buf_size;

	if (full_nid_string)	/* Already built, reuse the cached value */
		return;

	lock_slurmctld(read_node_lock);
	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
	     i++, node_ptr++) {
		/* NOTE(review): _node_name2nid() presumably maps a node name
		 * to its NID string; confirm against its definition. */
		if (!hs)
			hs = hostset_create(_node_name2nid(node_ptr->name));
		else
			hostset_insert(hs, _node_name2nid(node_ptr->name));
		num_ent++;
	}
	unlock_slurmctld(read_node_lock);
	if (!hs) {
		error("%s: No nodes found", __func__);
		return;
	}

	/*
	 * Size the buffer from num_ent, which was counted while the node
	 * lock was held, rather than re-reading node_record_count after the
	 * lock has been released. Six bytes are budgeted per node, plus two
	 * for the enclosing brackets (assumes NIDs of at most five digits
	 * plus a separator -- TODO confirm).
	 */
	buf_size = num_ent * 6 + 2;
	tmp_str = xmalloc(buf_size);
	/* Pass the full allocation size and refuse a truncated list instead
	 * of caching a possibly malformed NID specification. */
	if (hostset_ranged_string(hs, buf_size, tmp_str) < 0) {
		error("%s: NID list for %d nodes truncated at %d bytes",
		      __func__, num_ent, buf_size);
		hostset_destroy(hs);
		xfree(tmp_str);
		return;
	}
	hostset_destroy(hs);

	/* Strip the enclosing "[...]" so only the bare NID ranges remain */
	if ((sep = strrchr(tmp_str, ']')))
		sep[0] = '\0';
	if (tmp_str[0] == '[')
		full_nid_string = xstrdup(tmp_str + 1);
	else
		full_nid_string = xstrdup(tmp_str);
	xfree(tmp_str);
}
static void _get_caps(void) static void _get_caps(void)
{ {
/* Write nodes */ /* Write nodes */
...@@ -713,7 +755,7 @@ static void _get_caps(void) ...@@ -713,7 +755,7 @@ static void _get_caps(void)
for (i = 0; i < num_ent; i++) { for (i = 0; i < num_ent; i++) {
node_ptr = find_node_record2(ents[i].node_name[0]); node_ptr = find_node_record2(ents[i].node_name[0]);
if (!node_ptr) { if (!node_ptr) {
debug("%s: Node %s not in Slurm config", debug2("%s: Node %s not in Slurm config",
__func__, ents[i].node_name[0]); __func__, ents[i].node_name[0]);
} else { } else {
if (!node_ptr->power) { if (!node_ptr->power) {
...@@ -722,7 +764,7 @@ static void _get_caps(void) ...@@ -722,7 +764,7 @@ static void _get_caps(void)
} }
node_ptr->power->cap_watts = ents[i].cap_watts; node_ptr->power->cap_watts = ents[i].cap_watts;
} }
xfree(ents[i].node_name[0]); xfree(ents[i].node_name[0]); /* FUTURE: array of node names */
xfree(ents[i].node_name); xfree(ents[i].node_name);
} }
xfree(ents); xfree(ents);
...@@ -1020,20 +1062,28 @@ static void _get_node_energy_counter(void) ...@@ -1020,20 +1062,28 @@ static void _get_node_energy_counter(void)
struct node_record *node_ptr; struct node_record *node_ptr;
DEF_TIMERS; DEF_TIMERS;
_build_full_nid_string();
if (!full_nid_string)
return;
script_argv[0] = capmc_path; script_argv[0] = capmc_path;
script_argv[1] = "get_node_energy_counter"; script_argv[1] = "get_node_energy_counter";
script_argv[2] = NULL; script_argv[2] = "--nids";
script_argv[3] = full_nid_string;
script_argv[4] = NULL;
START_TIMER; START_TIMER;
cmd_resp = power_run_script("capmc", capmc_path, script_argv, cmd_resp = power_run_script("capmc", capmc_path, script_argv,
get_timeout, NULL, &status); get_timeout, NULL, &status);
END_TIMER; END_TIMER;
if (status != 0) { if (status != 0) {
error("%s: capmc %s: %s", __func__, script_argv[1], cmd_resp); error("%s: capmc %s %s %s: %s", __func__,
script_argv[1], script_argv[2], script_argv[3], cmd_resp);
xfree(cmd_resp); xfree(cmd_resp);
return; return;
} else if (debug_flag & DEBUG_FLAG_POWER) { } else if (debug_flag & DEBUG_FLAG_POWER) {
info("%s: capmc %s %s", __func__, script_argv[1], TIME_STR); info("%s: capmc %s %s %s %s", __func__,
script_argv[1], script_argv[2], script_argv[3], TIME_STR);
} }
if ((cmd_resp == NULL) || (cmd_resp[0] == '\0')) { if ((cmd_resp == NULL) || (cmd_resp[0] == '\0')) {
xfree(cmd_resp); xfree(cmd_resp);
...@@ -1698,6 +1748,7 @@ extern void fini(void) ...@@ -1698,6 +1748,7 @@ extern void fini(void)
pthread_join(power_thread, NULL); pthread_join(power_thread, NULL);
power_thread = 0; power_thread = 0;
xfree(capmc_path); xfree(capmc_path);
xfree(full_nid_string);
} }
pthread_mutex_unlock(&thread_flag_mutex); pthread_mutex_unlock(&thread_flag_mutex);
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment