From 9d351634135eb6cf90c0091fb7f6190497aae98e Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Sat, 2 Aug 2003 01:33:32 +0000 Subject: [PATCH] Only report a node's response error once. Don't keep reporting "Can't connect to node" with every ping failure. --- src/slurmctld/agent.c | 15 +++++++++++++-- src/slurmctld/node_mgr.c | 22 +++++++++++++++++++++- src/slurmctld/slurmctld.h | 7 +++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 135c0f9ded3..4ba0c2f6f73 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -73,6 +73,7 @@ #include "src/slurmctld/agent.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/ping_nodes.h" +#include "src/slurmctld/slurmctld.h" #if COMMAND_TIMEOUT == 1 # define WDOG_POLL 1 /* secs */ @@ -126,6 +127,7 @@ typedef struct task_info { } task_info_t; static void _alarm_handler(int dummy); +static inline void _comm_err(char *node_name); static void _list_delete_retry(void *retry_entry); static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr); static task_info_t *_make_task_data(agent_info_t *agent_info_ptr, int inx); @@ -472,6 +474,15 @@ static void *_wdog(void *args) return (void *) NULL; } +/* Report a communications error for specified node */ +static inline void _comm_err(char *node_name) +{ +#if AGENT_IS_THREAD + if (is_node_resp (node_name)) +#endif + error("agent/send_recv_msg: %s: %m", node_name); +} + /* * _thread_per_node_rpc - thread to issue an RPC on a collection of nodes * IN/OUT args - pointer to task_info_t, xfree'd on completion @@ -520,12 +531,12 @@ static void *_thread_per_node_rpc(void *args) if (task_ptr->get_reply) { if (slurm_send_recv_rc_msg(&msg, &rc, timeout) < 0) { - error("agent: %s: %m", thread_ptr->node_name); + _comm_err(thread_ptr->node_name); goto cleanup; } } else { if (slurm_send_only_node_msg(&msg) < 0) - error("agent: %s: %m", thread_ptr->node_name); + _comm_err(thread_ptr->node_name); else thread_state = DSH_DONE; goto cleanup; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index b466a6b18f4..ad62f5b5001 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1036,7 +1036,7 @@ validate_node_specs (char *node_name, uint32_t cpus, struct config_record *config_ptr; struct node_record *node_ptr; uint16_t resp_state; - char *reason_down; + char *reason_down = NULL; node_ptr = find_node_record (node_name); if (node_ptr == NULL) @@ -1263,6 +1263,26 @@ bool is_node_down (char *name) return false; } +/* + * is_node_resp - determine if the specified node's state is responding + * IN name - name of the node + * RET true if node exists and is responding, otherwise false + */ +bool is_node_resp (char *name) +{ + struct node_record *node_ptr; + + node_ptr = find_node_record (name); + if (node_ptr == NULL) { + error ("is_node_resp unable to find node %s", name); + return false; + } + + if (node_ptr->node_state & NODE_STATE_NO_RESPOND) + return false; + return true; +} + /* * find_first_node_record - find a record for first node in the bitmap * IN node_bitmap diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 765c36b318a..9d368b1779d 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -569,6 +569,13 @@ extern int init_part_conf (void); */ extern bool is_node_down (char *name); +/* + * is_node_resp - determine if the specified node's state is responding + * IN name - name of the node + * RET true if node exists and is responding, otherwise false + */ +extern bool is_node_resp (char *name); + /* * job_allocate - create job_records for the suppied job specification and * allocate nodes for it. -- GitLab