From 63fdb35434a5cb2b69fa90002948b3c6799818c7 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 25 Jun 2003 20:20:30 +0000 Subject: [PATCH] Minor changes in how COMPLETING jobs and their nodes are handled. If a node is down and not responding, don't bother to send a KILL_JOB RPC to it. If that is the only node associated with a job, don't have that job go through a COMPLETING state. It goes directly to a COMPLETED state. Also preserve the NO_RESPOND flag associated with a node if its state is changed via user request (e.g. scontrol). --- src/slurmctld/node_mgr.c | 23 ++++++++++++----------- src/slurmctld/node_scheduler.c | 6 +++++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 730d84d8398..c93a1493535 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -912,8 +912,9 @@ int update_node ( update_node_msg_t * update_node_msg ) { int error_code = 0, state_val, node_inx; char *this_node_name ; - struct node_record *node_record_point; + struct node_record *node_ptr; hostlist_t host_list; + uint16_t no_resp_flag = 0; if (update_node_msg -> node_names == NULL ) { error ("update_node: invalid node name %s", @@ -932,9 +933,9 @@ int update_node ( update_node_msg_t * update_node_msg ) last_node_update = time (NULL); while ( (this_node_name = hostlist_shift (host_list)) ) { - node_record_point = find_node_record (this_node_name); - node_inx = node_record_point - node_record_table_ptr; - if (node_record_point == NULL) { + node_ptr = find_node_record (this_node_name); + node_inx = node_ptr - node_record_table_ptr; + if (node_ptr == NULL) { error ("update_node: node %s does not exist", this_node_name); error_code = ESLURM_INVALID_NODE_NAME; @@ -944,8 +945,8 @@ int update_node ( update_node_msg_t * update_node_msg ) if (state_val != NO_VAL) { if (state_val == NODE_STATE_DOWN) { - bit_clear (up_node_bitmap, node_inx); - bit_clear (idle_node_bitmap, node_inx); + /* We must set node down before killing its jobs */ + _make_node_down(node_ptr); kill_running_job_by_node_name (this_node_name, false); } @@ -977,8 +978,7 @@ int update_node ( update_node_msg_t * update_node_msg ) else if (state_val == NODE_STATE_NO_RESPOND) { bit_clear (up_node_bitmap, node_inx); bit_clear (idle_node_bitmap, node_inx); - node_record_point->node_state |= - NODE_STATE_NO_RESPOND; + node_ptr->node_state |= NODE_STATE_NO_RESPOND; info ("update_node: node %s state set to %s", this_node_name, "NoResp"); continue; @@ -989,7 +989,8 @@ int update_node ( update_node_msg_t * update_node_msg ) continue; } - node_record_point->node_state = state_val; + no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND; + node_ptr->node_state = state_val | no_resp_flag; info ("update_node: node %s state set to %s", this_node_name, node_state_string(state_val)); } @@ -1209,8 +1210,8 @@ void set_node_down (char *name) return; } - (void) kill_running_job_by_node_name(name, false); _make_node_down(node_ptr); + (void) kill_running_job_by_node_name(name, false); return; } @@ -1278,9 +1279,9 @@ void ping_nodes (void) (base_state != NODE_STATE_DOWN)) { error ("Node %s not responding, setting DOWN", node_record_table_ptr[i].name); + _make_node_down(&node_record_table_ptr[i]); kill_running_job_by_node_name ( node_record_table_ptr[i].name, false); - _make_node_down(&node_record_table_ptr[i]); continue; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 471bfec42c9..eec6ed24ed5 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -126,7 +126,7 @@ int count_cpus(unsigned *bitmap) /* * deallocate_nodes - for a given job, deallocate its nodes and make * their state NODE_STATE_COMPLETING - * IN job_ptr - pointer to terminating job + * IN job_ptr - pointer to terminating job (already in some COMPLETING state) * IN timeout - true of job exhausted time limit, send REQUEST_KILL_TIMELIMIT * RPC instead of REQUEST_KILL_JOB * globals: node_record_count - number of nodes in the system @@ -159,6 +159,9 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout) for (i = 0; i < node_record_count; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; + if (node_record_table_ptr[i].node_state == + (NODE_STATE_DOWN | NODE_STATE_NO_RESPOND)) + continue; /* don't bother with dead nodes */ if ((agent_args->node_count + 1) > buf_rec_size) { buf_rec_size += 32; xrealloc((agent_args->slurm_addr), @@ -180,6 +183,7 @@ void deallocate_nodes(struct job_record *job_ptr, bool timeout) error("Job %u allocated no nodes to be killed on", job_ptr->job_id); xfree(agent_args); + job_ptr->job_state &= (~JOB_COMPLETING); return; } -- GitLab