Skip to content
Snippets Groups Projects
Commit da29f2dc authored by Moe Jette
Browse files

Fix logic so that we clear the POWER_SAVE flag on nodes on reconfig, as appropriate
parent ee0214d6
No related branches found
No related tags found
No related merge requests found
...@@ -345,36 +345,38 @@ static int _init_power_config(void) ...@@ -345,36 +345,38 @@ static int _init_power_config(void)
slurm_conf_unlock(); slurm_conf_unlock();
if (idle_time < 0) { /* not an error */ if (idle_time < 0) { /* not an error */
debug("power_save module disabled, idle_time < 0"); debug("power_save module disabled, SuspendTime < 0");
return -1; return -1;
} }
if (suspend_rate < 1) { if (suspend_rate < 1) {
error("power_save module disabled, suspend_rate < 1"); error("power_save module disabled, SuspendRate < 1");
return -1; return -1;
} }
if (resume_rate < 1) { if (resume_rate < 1) {
error("power_save module disabled, resume_rate < 1"); error("power_save module disabled, ResumeRate < 1");
return -1; return -1;
} }
if (suspend_prog == NULL) if (suspend_prog == NULL) {
info("WARNING: power_save module has NULL suspend program"); error("power_save module disabled, NULL SuspendProgram");
else if (!_valid_prog(suspend_prog)) { return -1;
error("power_save module disabled, invalid suspend program %s", } else if (!_valid_prog(suspend_prog)) {
error("power_save module disabled, invalid SuspendProgram %s",
suspend_prog); suspend_prog);
return -1; return -1;
} }
if (resume_prog == NULL) if (resume_prog == NULL) {
info("WARNING: power_save module has NULL resume program"); error("power_save module disabled, NULL ResumeProgram");
else if (!_valid_prog(resume_prog)) { return -1;
error("power_save module disabled, invalid resume program %s", } else if (!_valid_prog(resume_prog)) {
error("power_save module disabled, invalid ResumeProgram %s",
resume_prog); resume_prog);
return -1; return -1;
} }
if (exc_nodes if (exc_nodes &&
&& (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) { (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
error("power_save module disabled, " error("power_save module disabled, "
"invalid excluded nodes %s", exc_nodes); "invalid SuspendExcNodes %s", exc_nodes);
return -1; return -1;
} }
...@@ -389,7 +391,7 @@ static int _init_power_config(void) ...@@ -389,7 +391,7 @@ static int _init_power_config(void)
part_ptr = find_part_record(one_part); part_ptr = find_part_record(one_part);
if (!part_ptr) { if (!part_ptr) {
error("power_save module disabled, " error("power_save module disabled, "
"invalid excluded partition %s", "invalid SuspendExcPart %s",
one_part); one_part);
rc = -1; rc = -1;
break; break;
...@@ -468,8 +470,11 @@ extern void *init_power_save(void *arg) ...@@ -468,8 +470,11 @@ extern void *init_power_save(void *arg)
} }
if ((last_config != slurmctld_conf.last_update) && if ((last_config != slurmctld_conf.last_update) &&
(_init_power_config())) (_init_power_config())) {
info("power_save mode has been disabled due to "
"configuration changes");
goto fini; goto fini;
}
/* Only run every 60 seconds or after /* Only run every 60 seconds or after
* a node state change, whichever * a node state change, whichever
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
* any later version. * any later version.
* *
* In addition, as a special exception, the copyright holders give permission * In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under * to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and * certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU * distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than * General Public License in all respects for all of the code used other than
...@@ -840,10 +840,10 @@ int read_slurm_conf(int recover) ...@@ -840,10 +840,10 @@ int read_slurm_conf(int recover)
_build_bitmaps_pre_select(); _build_bitmaps_pre_select();
if ((select_g_node_init(node_record_table_ptr, node_record_count) if ((select_g_node_init(node_record_table_ptr, node_record_count)
!= SLURM_SUCCESS) != SLURM_SUCCESS) ||
|| (select_g_block_init(part_list) != SLURM_SUCCESS) (select_g_block_init(part_list) != SLURM_SUCCESS) ||
|| (select_g_state_restore(state_save_dir) != SLURM_SUCCESS) (select_g_state_restore(state_save_dir) != SLURM_SUCCESS) ||
|| (select_g_job_init(job_list) != SLURM_SUCCESS)) { (select_g_job_init(job_list) != SLURM_SUCCESS)) {
fatal("failed to initialize node selection plugin state, " fatal("failed to initialize node selection plugin state, "
"Clean start required."); "Clean start required.");
} }
...@@ -909,10 +909,17 @@ int read_slurm_conf(int recover) ...@@ -909,10 +909,17 @@ int read_slurm_conf(int recover)
/* Restore node state and size information from saved records. /* Restore node state and size information from saved records.
* If a node was re-configured to be down or drained, we set those states */ * If a node was re-configured to be down or drained, we set those states */
static int _restore_node_state(struct node_record *old_node_table_ptr, static int _restore_node_state(struct node_record *old_node_table_ptr,
int old_node_record_count) int old_node_record_count)
{ {
struct node_record *node_ptr; struct node_record *node_ptr;
int i, rc = SLURM_SUCCESS; int i, rc = SLURM_SUCCESS;
hostset_t hs = NULL;
slurm_ctl_conf_t *conf = slurm_conf_lock();
bool power_save_mode = false;
if (conf->suspend_program && conf->resume_program)
power_save_mode = true;
slurm_conf_unlock();
for (i = 0; i < old_node_record_count; i++) { for (i = 0; i < old_node_record_count; i++) {
uint16_t drain_flag = false, down_flag = false; uint16_t drain_flag = false, down_flag = false;
...@@ -920,7 +927,8 @@ static int _restore_node_state(struct node_record *old_node_table_ptr, ...@@ -920,7 +927,8 @@ static int _restore_node_state(struct node_record *old_node_table_ptr,
if (node_ptr == NULL) if (node_ptr == NULL)
continue; continue;
if ((node_ptr->node_state & NODE_STATE_BASE) == NODE_STATE_DOWN) if ((node_ptr->node_state & NODE_STATE_BASE) ==
NODE_STATE_DOWN)
down_flag = true; down_flag = true;
if (node_ptr->node_state & NODE_STATE_DRAIN) if (node_ptr->node_state & NODE_STATE_DRAIN)
drain_flag = true; drain_flag = true;
...@@ -931,7 +939,15 @@ static int _restore_node_state(struct node_record *old_node_table_ptr, ...@@ -931,7 +939,15 @@ static int _restore_node_state(struct node_record *old_node_table_ptr,
} }
if (drain_flag) if (drain_flag)
node_ptr->node_state |= NODE_STATE_DRAIN; node_ptr->node_state |= NODE_STATE_DRAIN;
if ((node_ptr->node_state & NODE_STATE_POWER_SAVE) &&
(!power_save_mode)) {
node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
if (hs)
hostset_insert(hs, node_ptr->name);
else
hs = hostset_create(node_ptr->name);
}
node_ptr->last_response = old_node_table_ptr[i].last_response; node_ptr->last_response = old_node_table_ptr[i].last_response;
if (old_node_table_ptr[i].port != node_ptr->config_ptr->cpus) { if (old_node_table_ptr[i].port != node_ptr->config_ptr->cpus) {
rc = ESLURM_NEED_RESTART; rc = ESLURM_NEED_RESTART;
...@@ -966,6 +982,13 @@ static int _restore_node_state(struct node_record *old_node_table_ptr, ...@@ -966,6 +982,13 @@ static int _restore_node_state(struct node_record *old_node_table_ptr,
old_node_table_ptr[i].os = NULL; old_node_table_ptr[i].os = NULL;
} }
} }
if (hs) {
char node_names[128];
hostset_ranged_string(hs, sizeof(node_names), node_names);
info("Cleared POWER_SAVE flag from nodes %s", node_names);
hostset_destroy(hs);
}
return rc; return rc;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment