Skip to content
Snippets Groups Projects
Commit d99cf552 authored by Morris Jette's avatar Morris Jette
Browse files

Avoid duplicate PowerUp of node on slurmctld start

This fixes a race condition if the slurmctld needed to power up
a node shortly after startup. Previously it would execute the
ResumeProgram twice for effected nodes.
parent 13023913
No related branches found
No related tags found
No related merge requests found
...@@ -90,7 +90,8 @@ char *exc_nodes = NULL, *exc_parts = NULL; ...@@ -90,7 +90,8 @@ char *exc_nodes = NULL, *exc_parts = NULL;
time_t last_config = (time_t) 0, last_suspend = (time_t) 0; time_t last_config = (time_t) 0, last_suspend = (time_t) 0;
uint16_t slurmd_timeout; uint16_t slurmd_timeout;
bitstr_t *exc_node_bitmap = NULL, *suspend_node_bitmap = NULL; bitstr_t *exc_node_bitmap = NULL;
bitstr_t *suspend_node_bitmap = NULL, *resume_node_bitmap = NULL;
int suspend_cnt, resume_cnt; int suspend_cnt, resume_cnt;
float suspend_cnt_f, resume_cnt_f; float suspend_cnt_f, resume_cnt_f;
...@@ -137,6 +138,8 @@ static void _do_power_work(time_t now) ...@@ -137,6 +138,8 @@ static void _do_power_work(time_t now)
if (last_suspend) { if (last_suspend) {
bit_nclear(suspend_node_bitmap, 0, bit_nclear(suspend_node_bitmap, 0,
(node_record_count - 1)); (node_record_count - 1));
bit_nclear(resume_node_bitmap, 0,
(node_record_count - 1));
last_suspend = (time_t) 0; last_suspend = (time_t) 0;
} }
} }
...@@ -170,7 +173,8 @@ static void _do_power_work(time_t now) ...@@ -170,7 +173,8 @@ static void _do_power_work(time_t now)
bit_clear(power_node_bitmap, i); bit_clear(power_node_bitmap, i);
bit_clear(avail_node_bitmap, i); bit_clear(avail_node_bitmap, i);
node_ptr->last_response = now + resume_timeout; node_ptr->last_response = now + resume_timeout;
bit_set(wake_node_bitmap, i); bit_set(wake_node_bitmap, i);
bit_set(resume_node_bitmap, i);
} }
/* Suspend nodes as appropriate */ /* Suspend nodes as appropriate */
...@@ -247,7 +251,8 @@ static void _re_wake(void) ...@@ -247,7 +251,8 @@ static void _re_wake(void)
if (IS_NODE_ALLOCATED(node_ptr) && if (IS_NODE_ALLOCATED(node_ptr) &&
IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_NO_RESPOND(node_ptr) &&
!IS_NODE_POWER_SAVE(node_ptr) && !IS_NODE_POWER_SAVE(node_ptr) &&
(bit_test(suspend_node_bitmap, i) == 0)) { (bit_test(suspend_node_bitmap, i) == 0) &&
(bit_test(resume_node_bitmap, i) == 0)) {
if (wake_node_bitmap == NULL) { if (wake_node_bitmap == NULL) {
wake_node_bitmap = wake_node_bitmap =
bit_alloc(node_record_count); bit_alloc(node_record_count);
...@@ -620,6 +625,7 @@ static void *_init_power_save(void *arg) ...@@ -620,6 +625,7 @@ static void *_init_power_save(void *arg)
goto fini; goto fini;
suspend_node_bitmap = bit_alloc(node_record_count); suspend_node_bitmap = bit_alloc(node_record_count);
resume_node_bitmap = bit_alloc(node_record_count);
while (slurmctld_config.shutdown_time == 0) { while (slurmctld_config.shutdown_time == 0) {
sleep(1); sleep(1);
...@@ -663,6 +669,7 @@ static void *_init_power_save(void *arg) ...@@ -663,6 +669,7 @@ static void *_init_power_save(void *arg)
fini: _clear_power_config(); fini: _clear_power_config();
FREE_NULL_BITMAP(suspend_node_bitmap); FREE_NULL_BITMAP(suspend_node_bitmap);
FREE_NULL_BITMAP(resume_node_bitmap);
_shutdown_power(); _shutdown_power();
slurm_mutex_lock(&power_mutex); slurm_mutex_lock(&power_mutex);
power_save_enabled = false; power_save_enabled = false;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment