From fc65aaf3f36221fd4616716665b26e274aaabd1f Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 23 Jan 2003 22:54:11 +0000
Subject: [PATCH] Make nodes DOWN and log event on slurmd prolog/epilog failure.

---
 src/slurmctld/agent.c          | 56 ++++++++++++++++++++++------------
 src/slurmctld/node_mgr.c       | 26 ++++++++++++++++
 src/slurmctld/node_scheduler.c |  4 ++-
 src/slurmctld/slurmctld.h      |  4 +++
 4 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 7e0707ed85e..2829f20ef75 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -73,7 +73,8 @@
 # define WDOG_POLL 2	/* secs */
 #endif

-typedef enum { DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED } state_t;
+typedef enum { DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_NO_RESP,
+	DSH_FAILED } state_t;

 typedef struct thd {
 	pthread_t thread;	/* thread ID */
@@ -296,7 +297,8 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
  */
 static void *_wdog(void *args)
 {
-	int i, fail_cnt, work_done, delay, max_delay = 0;
+	int fail_cnt, no_resp_cnt, work_done;
+	int i, delay, max_delay = 0;
 	agent_info_t *agent_ptr = (agent_info_t *) args;
 	thd_t *thread_ptr = agent_ptr->thread_struct;
 #if AGENT_IS_THREAD
@@ -309,8 +311,10 @@ static void *_wdog(void *args)
 #endif

 	while (1) {
-		work_done = 1;	/* assume all threads complete for now */
-		fail_cnt = 0;	/* assume all threads complete sucessfully for now */
+		work_done = 1;	/* assume all threads complete */
+		fail_cnt = 0;	/* assume no thread failures */
+		no_resp_cnt = 0;	/* assume all threads respond */
+
 		sleep(WDOG_POLL);

 		slurm_mutex_lock(&agent_ptr->thread_mutex);
@@ -332,6 +336,9 @@ static void *_wdog(void *args)
 				max_delay = (int) thread_ptr[i].time;
 			break;
+		case DSH_NO_RESP:
+			no_resp_cnt++;
+			break;
 		case DSH_FAILED:
 			fail_cnt++;
 			break;
@@ -343,12 +350,12 @@ static void *_wdog(void *args)
 		}

 		/* Notify slurmctld of non-responding nodes */
-		if (fail_cnt) {
+		if (no_resp_cnt) {
 #if AGENT_IS_THREAD
 			/* Update node
 			   table data for non-responding nodes */
 			lock_slurmctld(node_write_lock);
 			for (i = 0; i < agent_ptr->thread_count; i++) {
-				if (thread_ptr[i].state == DSH_FAILED)
+				if (thread_ptr[i].state == DSH_NO_RESP)
 					node_not_resp(thread_ptr[i].node_name);
 			}
 			unlock_slurmctld(node_write_lock);
@@ -358,7 +365,7 @@ static void *_wdog(void *args)
 			slurm_names = xmalloc(fail_cnt * MAX_NAME_LEN);
 			fail_cnt = 0;
 			for (i = 0; i < agent_ptr->thread_count; i++) {
-				if (thread_ptr[i].state == DSH_FAILED) {
+				if (thread_ptr[i].state == DSH_NO_RESP) {
 					strncpy(&slurm_names
 						[MAX_NAME_LEN * fail_cnt],
 						thread_ptr[i].node_name,
@@ -383,6 +390,8 @@ static void *_wdog(void *args)
 		/* Update last_response on responding nodes */
 		lock_slurmctld(node_write_lock);
 		for (i = 0; i < agent_ptr->thread_count; i++) {
+			if (thread_ptr[i].state == DSH_FAILED)
+				set_node_down(thread_ptr[i].node_name);
 			if (thread_ptr[i].state == DSH_DONE)
 				node_did_resp(thread_ptr[i].node_name);
 		}
@@ -390,16 +399,17 @@ static void *_wdog(void *args)
 #else
 		/* Build a list of all responding nodes and send it to slurmctld to
 		 * update time stamps */
-		done_cnt = agent_ptr->thread_count - fail_cnt;
+		done_cnt = agent_ptr->thread_count - fail_cnt - no_resp_cnt;
 		slurm_names = xmalloc(done_cnt * MAX_NAME_LEN);
 		done_cnt = 0;
 		for (i = 0; i < agent_ptr->thread_count; i++) {
-			if (thread_ptr[i].state == DSH_DONE) {
+			if (thread_ptr[i].state == DSH_DONE) {
 				strncpy(&slurm_names[MAX_NAME_LEN * done_cnt],
 					thread_ptr[i].node_name, MAX_NAME_LEN);
 				done_cnt++;
 			}
 		}
+		/* need support for node failures here too */

 		/* send RPC */
 		fatal("Code development needed here if agent is not thread");
@@ -428,7 +438,7 @@ static void *_thread_per_node_rpc(void *args)
 	return_code_msg_t *slurm_rc_msg;
 	task_info_t *task_ptr = (task_info_t *) args;
 	thd_t *thread_ptr = task_ptr->thread_struct_ptr;
-	state_t thread_state = DSH_FAILED;
+	state_t thread_state = DSH_NO_RESP;
 	sigset_t set;

 	/* set up SIGALRM handler */
@@ -494,19 +504,27 @@ static void *_thread_per_node_rpc(void
 *args)
 		slurm_rc_msg = (return_code_msg_t *) response_msg->data;
 		rc = slurm_rc_msg->return_code;
 		slurm_free_return_code_msg(slurm_rc_msg);
-		if (rc)
-			error("_thread_per_node_rpc/rc error from host %s: %s",
+		if (rc == 0) {
+			debug3("agent processed RPC to node %s",
+				thread_ptr->node_name);
+			thread_state = DSH_DONE;
+		} else if (rc == ESLURMD_EPILOG_FAILED) {
+			error("Epilog failure on host %s, setting DOWN",
+				thread_ptr->node_name);
+			thread_state = DSH_FAILED;
+		} else if (rc == ESLURMD_PROLOG_FAILED) {
+			error("Prolog failure on host %s, setting DOWN",
+				thread_ptr->node_name);
+			thread_state = DSH_FAILED;
+		} else {
+			error("agent error from host %s: %s",
 				thread_ptr->node_name,
 				slurm_strerror(rc));	/* Don't use %m */
-		else {
-			debug3
-			    ("agent sucessfully processed RPC to node %s",
-			     thread_ptr->node_name);
+			thread_state = DSH_DONE;
 		}
-		thread_state = DSH_DONE;
 		break;
 	default:
-		error("_thread_per_node_rpc from host %s, bad msg_type %d",
+		error("agent reply from host %s, bad msg_type %d",
 			thread_ptr->node_name, response_msg->msg_type);
 		break;
 	}
@@ -578,7 +596,7 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count)
 	j = 0;
 	for (i = 0; i < agent_info_ptr->thread_count; i++) {
-		if (thread_ptr[i].state != DSH_FAILED)
+		if (thread_ptr[i].state != DSH_NO_RESP)
 			continue;
 		agent_arg_ptr->slurm_addr[j] = thread_ptr[i].slurm_addr;
 		strncpy(&agent_arg_ptr->node_names[j * MAX_NAME_LEN],
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 57a2765405b..14fa07b5bc7 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1194,6 +1194,32 @@ void node_not_resp (char *name)
 	return;
 }

+/* set_node_down - make the specified node's state DOWN, kill jobs as needed
+ * IN name - name of the node */
+void set_node_down (char *name)
+{
+	struct node_record *node_ptr;
+	int node_inx;
+	uint16_t resp_state;
+
+	node_ptr = find_node_record (name);
+	if (node_ptr == NULL) {
+		error ("set_node_down unable to find node %s", name);
+
 		return;
+	}
+
+	node_inx = node_ptr - node_record_table_ptr;
+	last_node_update = time (NULL);
+	/* preserve NODE_STATE_NO_RESPOND flag if set */
+	resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
+	node_ptr->node_state = NODE_STATE_DOWN | resp_state;
+	bit_clear (up_node_bitmap, node_inx);
+	bit_clear (idle_node_bitmap, node_inx);
+	(void) kill_running_job_by_node_name(name, false);
+
+	return;
+}
+
 /* ping_nodes - check that all nodes and daemons are alive,
  *	get nodes in UNKNOWN state to register */
 void ping_nodes (void)
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index cc01d77552e..fdd3ae0fc60 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -211,7 +211,9 @@ void make_node_idle(struct node_record *node_ptr)
 	base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
 	no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
-	if (base_state == NODE_STATE_DRAINING) {
+	if (base_state == NODE_STATE_DOWN) {
+		debug3("Node %s being left DOWN", node_ptr->name);
+	} else if (base_state == NODE_STATE_DRAINING) {
 		node_ptr->node_state = NODE_STATE_DRAINED;
 		bit_clear(idle_node_bitmap, inx);
 		bit_clear(up_node_bitmap, inx);
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 27b78407c64..3f985ba7ee8 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -883,6 +883,10 @@ extern int select_nodes (struct job_record *job_ptr, bool test_only);
  */
 extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid);

+/* set_node_down - make the specified node's state DOWN, kill jobs as needed
+ * IN name - name of the node */
+extern void set_node_down (char *name);
+
 /* set_slurmd_addr - establish the slurm_addr for the slurmd on each node
  *	Uses common data structures. */
 extern void set_slurmd_addr (void);
--
GitLab