From 9d351634135eb6cf90c0091fb7f6190497aae98e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Sat, 2 Aug 2003 01:33:32 +0000
Subject: [PATCH] Only report a node's response error once. Don't keep
 reporting "Can't connect to node" with every ping failure.

---
 src/slurmctld/agent.c     | 15 +++++++++++++--
 src/slurmctld/node_mgr.c  | 22 +++++++++++++++++++++-
 src/slurmctld/slurmctld.h |  7 +++++++
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 135c0f9ded3..4ba0c2f6f73 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -73,6 +73,7 @@
 #include "src/slurmctld/agent.h"
 #include "src/slurmctld/locks.h"
 #include "src/slurmctld/ping_nodes.h"
+#include "src/slurmctld/slurmctld.h"
 
 #if COMMAND_TIMEOUT == 1
 #  define WDOG_POLL 		1	/* secs */
@@ -126,6 +127,7 @@ typedef struct task_info {
 } task_info_t;
 
 static void _alarm_handler(int dummy);
+static inline void _comm_err(char *node_name);
 static void _list_delete_retry(void *retry_entry);
 static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr);
 static task_info_t *_make_task_data(agent_info_t *agent_info_ptr, int inx);
@@ -472,6 +474,15 @@ static void *_wdog(void *args)
 	return (void *) NULL;
 }
 
+/* Report a comm error for node_name (if AGENT_IS_THREAD, only when is_node_resp) */
+static inline void _comm_err(char *node_name)
+{
+#if AGENT_IS_THREAD /* when unset, the error() below is unconditional */
+	if (is_node_resp (node_name)) /* this if controls the error() after #endif */
+#endif
+		error("agent/send_recv_msg: %s: %m", node_name);
+}
+
 /*
  * _thread_per_node_rpc - thread to issue an RPC on a collection of nodes
  * IN/OUT args - pointer to task_info_t, xfree'd on completion
@@ -520,12 +531,12 @@ static void *_thread_per_node_rpc(void *args)
 
 	if (task_ptr->get_reply) {
 		if (slurm_send_recv_rc_msg(&msg, &rc, timeout) < 0) {
-			error("agent: %s: %m", thread_ptr->node_name);
+			_comm_err(thread_ptr->node_name);
 			goto cleanup;
 		}
 	} else {
 		if (slurm_send_only_node_msg(&msg) < 0)
-			error("agent: %s: %m", thread_ptr->node_name);
+			_comm_err(thread_ptr->node_name);
 		else
 			thread_state = DSH_DONE;
 		goto cleanup;
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index b466a6b18f4..ad62f5b5001 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1036,7 +1036,7 @@ validate_node_specs (char *node_name, uint32_t cpus,
 	struct config_record *config_ptr;
 	struct node_record *node_ptr;
 	uint16_t resp_state;
-	char *reason_down;
+	char *reason_down = NULL;
 
 	node_ptr = find_node_record (node_name);
 	if (node_ptr == NULL)
@@ -1263,6 +1263,26 @@ bool is_node_down (char *name)
 	return false;
 }
 
+/*
+ * is_node_resp - report whether a node lacks the NODE_STATE_NO_RESPOND flag
+ * IN name - name of the node (an error is logged if no record is found)
+ * RET true if node exists and NODE_STATE_NO_RESPOND is clear, otherwise false
+ */
+bool is_node_resp (char *name)
+{
+	struct node_record *node_ptr;
+
+	node_ptr = find_node_record (name);
+	if (node_ptr == NULL) { /* unknown node: log it and report false */
+		error ("is_node_resp unable to find node %s", name);
+		return false;
+	}
+
+	if (node_ptr->node_state & NODE_STATE_NO_RESPOND) /* flag bit set */
+		return false;
+	return true;
+}
+
 /*
  * find_first_node_record - find a record for first node in the bitmap
  * IN node_bitmap
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 765c36b318a..9d368b1779d 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -569,6 +569,13 @@ extern int init_part_conf (void);
  */
 extern bool is_node_down (char *name);
 
+/*
+ * is_node_resp - determine if the specified node's state is responding
+ * IN name - name of the node
+ * RET true if node exists and is responding, otherwise false
+ */
+extern bool is_node_resp (char *name);
+
 /*
  * job_allocate - create job_records for the suppied job specification and 
  *	allocate nodes for it.
-- 
GitLab