diff --git a/NEWS b/NEWS index ae7a3cbdcb2ea547341b25de34dd02ee10482536..83b93b6fb7efd4395b76f6677b075d5615f677ad 100644 --- a/NEWS +++ b/NEWS @@ -192,6 +192,9 @@ documents those changes that are of interest to users and admins. -- Reword the explanation of cputime and cputimeraw in the sacct man page. -- JobCompType allows "jobcomp/mysql" as valid name but the code used "job_comp/mysql" setting an incorrect default database. + -- Try to load libslurm.so only when necessary. + -- When nodes are scheduled for reboot, set state to DOWN rather than FUTURE so + they are still visible to sinfo. State set to IDLE after reboot completes. * Changes in Slurm 14.03.6 ========================== diff --git a/contribs/pam/pam_slurm.c b/contribs/pam/pam_slurm.c index ffa0620c2618f7135b0e33a4dd6578a323d9bb07..e01c928c723a5604b86b24d41790a2d6df27a6fd 100644 --- a/contribs/pam/pam_slurm.c +++ b/contribs/pam/pam_slurm.c @@ -422,7 +422,9 @@ extern void libpam_slurm_init (void) SLURM_API_REVISION, SLURM_API_AGE) >= sizeof(libslurmname) ) { _log_msg (LOG_ERR, "Unable to write libslurmname\n"); - } else if (!(slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + } else if ((slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + return; + } else { _log_msg (LOG_INFO, "Unable to dlopen %s: %s\n", libslurmname, dlerror ()); } @@ -430,7 +432,9 @@ extern void libpam_slurm_init (void) if (snprintf(libslurmname, sizeof(libslurmname), "libslurm.so.%d", SLURM_API_CURRENT) >= sizeof(libslurmname) ) { _log_msg (LOG_ERR, "Unable to write libslurmname\n"); - } else if (!(slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + } else if ((slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + return; + } else { _log_msg (LOG_INFO, "Unable to dlopen %s: %s\n", libslurmname, dlerror ()); } diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 91abde3cf63dcd32f8c8c479cf85246d6d24f532..a51c90509502a4319c6d2822b9f2c8d2346fdc8b 100644 --- a/src/slurmctld/controller.c +++ 
b/src/slurmctld/controller.c @@ -1324,9 +1324,10 @@ static void _queue_reboot_msg(void) } hostlist_push(reboot_agent_args->hostlist, node_ptr->name); reboot_agent_args->node_count++; - node_ptr->node_state = NODE_STATE_FUTURE | - (node_ptr->node_state & NODE_STATE_FLAGS); node_ptr->node_state &= ~NODE_STATE_MAINT; + node_ptr->node_state &= NODE_STATE_FLAGS; + node_ptr->node_state |= NODE_STATE_DOWN; + node_ptr->reason = xstrdup("Scheduled reboot"); bit_clear(avail_node_bitmap, i); bit_clear(idle_node_bitmap, i); node_ptr->last_response = now; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 8b6d59ee0a10c8f2e65348d122add4b3cc176343..29749c58c53e3bada450f0943886437b6c13e978 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -742,8 +742,7 @@ extern void pack_all_node (char **buffer_ptr, int *buffer_size, if (((show_flags & SHOW_ALL) == 0) && (uid != 0) && (_node_is_hidden(node_ptr))) hidden = true; - else if (IS_NODE_FUTURE(node_ptr) && - !IS_NODE_MAINT(node_ptr)) /* reboot req sent */ + else if (IS_NODE_FUTURE(node_ptr)) hidden = true; else if (IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr)) @@ -830,8 +829,7 @@ extern void pack_one_node (char **buffer_ptr, int *buffer_size, if (((show_flags & SHOW_ALL) == 0) && (uid != 0) && (_node_is_hidden(node_ptr))) hidden = true; - else if (IS_NODE_FUTURE(node_ptr) && - !IS_NODE_MAINT(node_ptr)) /* reboot req sent */ + else if (IS_NODE_FUTURE(node_ptr)) hidden = true; else if (IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr)) @@ -2203,10 +2201,9 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg, } } else if (IS_NODE_DOWN(node_ptr) && ((slurmctld_conf.ret2service == 2) || + !xstrcmp(node_ptr->reason, "Scheduled reboot") || ((slurmctld_conf.ret2service == 1) && - (node_ptr->reason != NULL) && - (strncmp(node_ptr->reason, "Not responding", 14) - == 0)))) { + !xstrcmp(node_ptr->reason, "Not responding")))) { if (reg_msg->job_count) { 
node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags; @@ -2564,10 +2561,11 @@ extern int validate_nodes_via_front_end( } } else if (IS_NODE_DOWN(node_ptr) && ((slurmctld_conf.ret2service == 2) || + !xstrcmp(node_ptr->reason, + "Scheduled reboot") || ((slurmctld_conf.ret2service == 1) && - (node_ptr->reason != NULL) && - (strncmp(node_ptr->reason, - "Not responding", 14) == 0)))) { + !xstrcmp(node_ptr->reason, + "Not responding")))) { update_node_state = true; *newly_up = true; if (node_ptr->run_job_cnt) { @@ -2679,9 +2677,10 @@ static void _node_did_resp(front_end_record_t *fe_ptr) fe_ptr->node_state = NODE_STATE_IDLE | node_flags; } if (IS_NODE_DOWN(fe_ptr) && - (slurmctld_conf.ret2service == 1) && - (fe_ptr->reason != NULL) && - (strncmp(fe_ptr->reason, "Not responding", 14) == 0)) { + ((slurmctld_conf.ret2service == 2) || + !xstrcmp(fe_ptr->reason, "Scheduled reboot") || + ((slurmctld_conf.ret2service == 1) && + !xstrcmp(fe_ptr->reason, "Not responding")))) { last_front_end_update = now; fe_ptr->node_state = NODE_STATE_IDLE | node_flags; info("node_did_resp: node %s returned to service", @@ -2730,9 +2729,10 @@ static void _node_did_resp(struct node_record *node_ptr) } } if (IS_NODE_DOWN(node_ptr) && - (slurmctld_conf.ret2service == 1) && - (node_ptr->reason != NULL) && - (strncmp(node_ptr->reason, "Not responding", 14) == 0)) { + ((slurmctld_conf.ret2service == 2) || + !xstrcmp(node_ptr->reason, "Scheduled reboot") || + ((slurmctld_conf.ret2service == 1) && + !xstrcmp(node_ptr->reason, "Not responding")))) { node_ptr->last_idle = now; node_ptr->node_state = NODE_STATE_IDLE | node_flags; info("node_did_resp: node %s returned to service",