diff --git a/NEWS b/NEWS index 28ab1113f30a753657e0be27fdea7ddf18cd0d05..72aace6b856fc70fa394f11ce751c9d3916d8815 100644 --- a/NEWS +++ b/NEWS @@ -290,6 +290,9 @@ documents those changes that are of interest to users and administrators. association GrpWall limit. -- Squeue always loads new data if user_id option specified -- Fix for possible job ID parsing failure and abort. + -- If a node boot is in progress when the slurmctld daemon is restarted, then + allow sufficient time for the reboot to complete rather than prematurely + setting the node DOWN as "Not responding". * Changes in Slurm 16.05.9 ========================== diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 9e063df76b337c9b6ab08b080450891266347840..34ba315b70b4d4d3d55e4fd6b4f2f635bc70991b 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -676,6 +676,14 @@ extern int load_all_node_state ( bool state_only ) if (node_ptr) { node_cnt++; + if (node_ptr->node_state & NODE_STATE_POWER_UP) { + /* last_response value not saved, + * make best guess */ + node_ptr->last_response = now + + slurmctld_conf.resume_timeout; + } else + node_ptr->last_response = (time_t) 0; + if (obj_protocol_version && (obj_protocol_version != (uint16_t)NO_VAL)) node_ptr->protocol_version =