diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index ddfdc5daf1862401834dbab5c5268e22c1ae7588..10fd556809fa66953ff20d8f25fe90ac904385c7 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1485,7 +1485,7 @@ void make_node_comp(struct node_record *node_ptr) base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND); no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND; - if ((base_state != NODE_STATE_DOWN) && (!no_resp_flag)) + if (base_state != NODE_STATE_DOWN) (node_ptr->comp_job_cnt)++; /* Don't verify RPC */ if ((base_state == NODE_STATE_DRAINING) && @@ -1502,13 +1502,9 @@ void make_node_comp(struct node_record *node_ptr) node_ptr->name, node_state_string((enum node_states) node_ptr->node_state)); - } else if (!no_resp_flag) { + } else { node_ptr->node_state = NODE_STATE_COMPLETING | no_resp_flag; xfree(node_ptr->reason); - } else if ( (base_state == NODE_STATE_ALLOCATED) && - (node_ptr->run_job_cnt == 0) ) { - bit_set(idle_node_bitmap, inx); - node_ptr->node_state = NODE_STATE_IDLE | no_resp_flag; } } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index cb287f2585f8704f527bbef2d673d0adca435cbe..d0bf69b30115b4783c2bb0646723ec6c9b75b070 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -170,7 +170,7 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout) continue; base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND); no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND; - if ((base_state == NODE_STATE_DOWN) || no_resp_flag) { + if (base_state == NODE_STATE_DOWN) { /* Issue the KILL RPC, but don't verify response */ down_node_cnt++; bit_clear(job_ptr->node_bitmap, i); @@ -1319,6 +1319,18 @@ extern void re_kill_job(struct job_record *job_ptr) struct node_record *node_ptr = &node_record_table_ptr[i]; if (bit_test(job_ptr->node_bitmap, i) == 0) continue; + if ((node_ptr->node_state & (~NODE_STATE_NO_RESPOND)) + == NODE_STATE_DOWN) { + /* Consider job already completed */ + bit_clear(job_ptr->node_bitmap, i); + if (node_ptr->comp_job_cnt) + (node_ptr->comp_job_cnt)--; + if ((--job_ptr->node_cnt) == 0) { + delete_all_step_records(job_ptr); + job_ptr->job_state &= (~JOB_COMPLETING); + } + continue; + } if (node_ptr->node_state & NODE_STATE_NO_RESPOND) continue; (void) hostlist_push_host(kill_hostlist, node_ptr->name);