Skip to content
Snippets Groups Projects
Commit 9d351634 authored by Moe Jette's avatar Moe Jette
Browse files

Only report a node's response error once. Don't keep reporting

"Can't connect to node" with every ping failure.
parent 014463e4
No related branches found
No related tags found
No related merge requests found
...@@ -73,6 +73,7 @@ ...@@ -73,6 +73,7 @@
#include "src/slurmctld/agent.h" #include "src/slurmctld/agent.h"
#include "src/slurmctld/locks.h" #include "src/slurmctld/locks.h"
#include "src/slurmctld/ping_nodes.h" #include "src/slurmctld/ping_nodes.h"
#include "src/slurmctld/slurmctld.h"
#if COMMAND_TIMEOUT == 1 #if COMMAND_TIMEOUT == 1
# define WDOG_POLL 1 /* secs */ # define WDOG_POLL 1 /* secs */
...@@ -126,6 +127,7 @@ typedef struct task_info { ...@@ -126,6 +127,7 @@ typedef struct task_info {
} task_info_t; } task_info_t;
static void _alarm_handler(int dummy); static void _alarm_handler(int dummy);
static inline void _comm_err(char *node_name);
static void _list_delete_retry(void *retry_entry); static void _list_delete_retry(void *retry_entry);
static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr); static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr);
static task_info_t *_make_task_data(agent_info_t *agent_info_ptr, int inx); static task_info_t *_make_task_data(agent_info_t *agent_info_ptr, int inx);
...@@ -472,6 +474,15 @@ static void *_wdog(void *args) ...@@ -472,6 +474,15 @@ static void *_wdog(void *args)
return (void *) NULL; return (void *) NULL;
} }
/*
 * _comm_err - log a communications error for the specified node
 * IN node_name - name of the node the failed message was addressed to
 *
 * When AGENT_IS_THREAD is set, the message is logged only while
 * is_node_resp() still reports the node as responding; otherwise it is
 * logged unconditionally.
 * NOTE(review): "%m" presumably expands from errno - confirm that
 * is_node_resp() cannot modify errno before error() formats the message.
 */
static inline void _comm_err(char *node_name)
{
#if AGENT_IS_THREAD
	/* Nothing to report for a node that is not responding */
	if (!is_node_resp(node_name))
		return;
#endif
	error("agent/send_recv_msg: %s: %m", node_name);
}
/* /*
* _thread_per_node_rpc - thread to issue an RPC on a collection of nodes * _thread_per_node_rpc - thread to issue an RPC on a collection of nodes
* IN/OUT args - pointer to task_info_t, xfree'd on completion * IN/OUT args - pointer to task_info_t, xfree'd on completion
...@@ -520,12 +531,12 @@ static void *_thread_per_node_rpc(void *args) ...@@ -520,12 +531,12 @@ static void *_thread_per_node_rpc(void *args)
if (task_ptr->get_reply) { if (task_ptr->get_reply) {
if (slurm_send_recv_rc_msg(&msg, &rc, timeout) < 0) { if (slurm_send_recv_rc_msg(&msg, &rc, timeout) < 0) {
error("agent: %s: %m", thread_ptr->node_name); _comm_err(thread_ptr->node_name);
goto cleanup; goto cleanup;
} }
} else { } else {
if (slurm_send_only_node_msg(&msg) < 0) if (slurm_send_only_node_msg(&msg) < 0)
error("agent: %s: %m", thread_ptr->node_name); _comm_err(thread_ptr->node_name);
else else
thread_state = DSH_DONE; thread_state = DSH_DONE;
goto cleanup; goto cleanup;
......
...@@ -1036,7 +1036,7 @@ validate_node_specs (char *node_name, uint32_t cpus, ...@@ -1036,7 +1036,7 @@ validate_node_specs (char *node_name, uint32_t cpus,
struct config_record *config_ptr; struct config_record *config_ptr;
struct node_record *node_ptr; struct node_record *node_ptr;
uint16_t resp_state; uint16_t resp_state;
char *reason_down; char *reason_down = NULL;
node_ptr = find_node_record (node_name); node_ptr = find_node_record (node_name);
if (node_ptr == NULL) if (node_ptr == NULL)
...@@ -1263,6 +1263,26 @@ bool is_node_down (char *name) ...@@ -1263,6 +1263,26 @@ bool is_node_down (char *name)
return false; return false;
} }
/*
 * is_node_resp - report whether the named node is currently responding
 * IN name - name of the node
 * RET true if the node's record is found and NODE_STATE_NO_RESPOND is not
 *	set in its node_state; false if the flag is set or if no record is
 *	found (an error is logged in the latter case)
 */
bool is_node_resp (char *name)
{
	struct node_record *rec = find_node_record (name);

	if (rec != NULL)
		return (rec->node_state & NODE_STATE_NO_RESPOND) == 0;

	/* Unknown node name: log it and treat the node as not responding */
	error ("is_node_resp unable to find node %s", name);
	return false;
}
/* /*
* find_first_node_record - find a record for first node in the bitmap * find_first_node_record - find a record for first node in the bitmap
* IN node_bitmap * IN node_bitmap
......
...@@ -569,6 +569,13 @@ extern int init_part_conf (void); ...@@ -569,6 +569,13 @@ extern int init_part_conf (void);
*/ */
extern bool is_node_down (char *name); extern bool is_node_down (char *name);
/*
* is_node_resp - determine if the specified node's state is responding
* IN name - name of the node
* RET true if node exists and is responding, otherwise false
*/
extern bool is_node_resp (char *name);
/* /*
* job_allocate - create job_records for the supplied job specification and * job_allocate - create job_records for the supplied job specification and
* allocate nodes for it. * allocate nodes for it.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment