diff --git a/NEWS b/NEWS index 226028fbd409d25ca39fbce94255c741c0b29215..c184daf0bfa31fd9ed461ac879593dc9295e3218 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,8 @@ documents those changes that are of interest to users and admins. assume control immediately. Patch from Matthieu Hautreux, CEA. -- If srun is unable to communicate with the slurmd tasks are now marked as failed with the controller. + -- Clear node's POWER_SAVE flag if configuration changes to one lacking a + ResumeProgram. * Changes in SLURM 2.0.0-rc1 ============================== diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 0730d9e34164feac9d92a2f6bb47f4cbf9a659ad..5102ff8e15c9a61e1cabaca6656f88de2e740ca2 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -424,6 +424,13 @@ extern int load_all_node_state ( bool state_only ) time_t time_stamp, now = time(NULL); Buf buffer; char *ver_str = NULL; + hostset_t hs = NULL; + slurm_ctl_conf_t *conf = slurm_conf_lock(); + bool power_save_mode = false; + + if (conf->resume_program && conf->resume_program[0]) + power_save_mode = true; + slurm_conf_unlock(); /* read the file */ state_file = xstrdup (slurmctld_conf.state_save_location); @@ -529,8 +536,14 @@ extern int load_all_node_state ( bool state_only ) if (node_state & NODE_STATE_FAIL) node_ptr->node_state |= NODE_STATE_FAIL; - if (node_state & NODE_STATE_POWER_SAVE) - node_ptr->node_state = node_state; + if (node_state & NODE_STATE_POWER_SAVE) { + if (power_save_mode) + node_ptr->node_state=node_state; + else if (hs) + hostset_insert(hs, node_name); + else + hs = hostset_create(node_name); + } } if (node_ptr->reason == NULL) node_ptr->reason = reason; @@ -540,6 +553,14 @@ extern int load_all_node_state ( bool state_only ) node_ptr->features = features; } else { node_cnt++; + if ((node_state & NODE_STATE_POWER_SAVE) && + (!power_save_mode)) { + node_state &= (~NODE_STATE_POWER_SAVE); + if (hs) + hostset_insert(hs, node_name); + else + hs = hostset_create(node_name); + } node_ptr->node_state = node_state; xfree(node_ptr->reason); node_ptr->reason = reason; @@ -559,15 +580,20 @@ extern int load_all_node_state ( bool state_only ) xfree (node_name); } - info ("Recovered state of %d nodes", node_cnt); +fini: info("Recovered state of %d nodes", node_cnt); + if (hs) { + char node_names[128]; + hostset_ranged_string(hs, sizeof(node_names), node_names); + info("Cleared POWER_SAVE flag from nodes %s", node_names); + hostset_destroy(hs); + } free_buf (buffer); return error_code; unpack_error: - error ("Incomplete node data checkpoint file"); - info("Recovered state of %d nodes", node_cnt); - free_buf (buffer); - return EFAULT; + error("Incomplete node data checkpoint file"); + error_code = EFAULT; + goto fini; } /* diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 14dbfecc08bf949dc6b8fa6350aff7640f56ce46..8cc32ee7a785644ddba102d55c13a4053810ba9a 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -166,12 +166,12 @@ typedef struct slurmctld_config { } slurmctld_config_t; extern slurmctld_config_t slurmctld_config; -extern int bg_recover; /* state recovery mode */ -extern char *slurmctld_cluster_name; /* name of cluster */ +extern int bg_recover; /* state recovery mode */ +extern char *slurmctld_cluster_name; /* name of cluster */ extern void *acct_db_conn; -extern int accounting_enforce; -extern int association_based_accounting; -extern int cluster_procs; +extern int accounting_enforce; +extern int association_based_accounting; +extern int cluster_procs; /*****************************************************************************\ * NODE parameters and data structures