diff --git a/NEWS b/NEWS index a44959fd16c151bfb16f6b9f762c6e76100c6c82..6052af6db816a3fd592c440caa20f9f236172537 100644 --- a/NEWS +++ b/NEWS @@ -26,6 +26,8 @@ documents those changes that are of interest to users and admins. -- scancel of a job step will now send a job-step-completed message to the controller after verifying that the step has completed on all nodes. -- Fix task layout bug in srun. + -- Added times to node "Reason" field when set down for insufficient + resources or if not responding. * Changes in SLURM 1.0.1 ======================== diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index c1819abd2194effb1ad1124092390bbc8ff1187c..83c68de1599803beae03f98099e46b21eb3855db 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -890,6 +890,8 @@ int update_node ( update_node_msg_t * update_node_msg ) false); } else if (state_val == NODE_STATE_IDLE) { + /* assume they want to clear DRAIN flag too */ + node_ptr->node_state &= (~NODE_STATE_DRAIN); bit_set (avail_node_bitmap, node_inx); bit_set (idle_node_bitmap, node_inx); reset_job_priority(); @@ -1455,7 +1457,7 @@ static void _node_did_resp(struct node_record *node_ptr) if ((base_state == NODE_STATE_DOWN) && (slurmctld_conf.ret2service == 1) && (node_ptr->reason != NULL) && - (strcmp(node_ptr->reason, "Not responding") == 0)) { + (strncmp(node_ptr->reason, "Not responding", 14) == 0)) { last_node_update = time (NULL); node_ptr->node_state = NODE_STATE_IDLE | node_flags; info("node_did_resp: node %s returned to service", @@ -1546,8 +1548,18 @@ void set_node_down (char *name, char *reason) _make_node_down(node_ptr); (void) kill_running_job_by_node_name(name, false); - if (node_ptr->reason == NULL) - node_ptr->reason = xstrdup(reason); + if (node_ptr->reason == NULL) { + time_t now; + struct tm *time_ptr; + char time_buf[64]; + + now = time (NULL); + time_ptr = localtime(&now); + strftime(time_buf, sizeof(time_buf), " [slurm@%b %d %H:%M]", + time_ptr); + node_ptr->reason = xstrdup(reason); + xstrcat(node_ptr->reason, time_buf); + } return; }