From 58d0267c15b7fdb4160a79260f50477b03c9c370 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 1 Oct 2004 20:33:38 +0000
Subject: [PATCH] Convert more node logging functions to use hostlist,
 especially those used on Blue Gene systems. This avoids a redundant message
 for each and every back-end node.

---
 src/slurmctld/node_mgr.c   | 69 +++++++++++++++++++++++++++++++-------
 src/slurmctld/ping_nodes.c |  2 +-
 2 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index ace9ed72b23..619b48d6f94 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -983,7 +983,6 @@ extern int drain_nodes ( char *nodes, char *reason )
 
 	last_node_update = time (NULL);
 	while ( (this_node_name = hostlist_shift (host_list)) ) {
-		int err_code = 0;
 		node_ptr = find_node_record (this_node_name);
 		node_inx = node_ptr - node_record_table_ptr;
 		if (node_ptr == NULL) {
@@ -1212,6 +1211,9 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 	struct node_record *node_ptr;
 	time_t now = time(NULL);
 	ListIterator job_iterator;
+	hostlist_t return_hostlist = NULL, reg_hostlist = NULL;
+	hostlist_t prolog_hostlist = NULL;
+	char host_str[64];
 
 	/* First validate the job info */
 	node_ptr = &node_record_table_ptr[0];	/* All msg send to node zero,
@@ -1249,7 +1251,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 
 		else if (job_ptr->job_state == JOB_PENDING) {
 			error("Registered PENDING job %u.%u",
-			      job_id_ptr[i], step_id_ptr[i]);
+				job_id_ptr[i], step_id_ptr[i]);
 			/* FIXME: Could possibly recover the job */
 			job_ptr->job_state = JOB_FAILED;
 			last_job_update    = now;
@@ -1260,10 +1262,9 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 		}
 
 		else {		/* else job is supposed to be done */
-			error
-			    ("Registered job %u.%u in state %s",
-			     job_id_ptr[i], step_id_ptr[i], 
-			     job_state_string(job_ptr->job_state));
+			error("Registered job %u.%u in state %s",
+				job_id_ptr[i], step_id_ptr[i], 
+				job_state_string(job_ptr->job_state));
 			kill_job_on_node(job_id_ptr[i], node_ptr);
 		}
 	}
@@ -1298,15 +1299,24 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 			if ((node_ptr->node_state != NODE_STATE_DRAINING) &&
 			    (node_ptr->node_state != NODE_STATE_DRAINED)) {
 				updated_job = true;
-				error ("Prolog failure on node %s, state to DOWN",
-					node_ptr->name);
+				if (prolog_hostlist)
+					(void) hostlist_push_host(
+						prolog_hostlist, 
+						node_ptr->name);
+				else
+					prolog_hostlist = hostlist_create(
+						node_ptr->name);
 				set_node_down(node_ptr->name, "Prolog failed");
 			}
 		} else {
 			if (node_ptr->node_state == NODE_STATE_UNKNOWN) {
 				updated_job = true;
-				debug("validate_node_specs: node %s has registered", 
-					node_ptr->name);
+				if (reg_hostlist)
+					(void) hostlist_push_host(
+						reg_hostlist, node_ptr->name);
+				else
+					reg_hostlist = hostlist_create(
+						node_ptr->name);
 				if (jobs_on_node)
 					node_ptr->node_state = NODE_STATE_ALLOCATED;
 				else
@@ -1329,8 +1339,12 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 					node_ptr->node_state = NODE_STATE_ALLOCATED;
 				else
 					node_ptr->node_state = NODE_STATE_IDLE;
-				info ("validate_node_specs: node %s returned to service", 
-				      node_ptr->name);
+				if (return_hostlist)
+					(void) hostlist_push_host(
+						return_hostlist, node_ptr->name);
+				else
+					return_hostlist = hostlist_create(
+						node_ptr->name);
 				xfree(node_ptr->reason);
 			} else if ((node_ptr->node_state == NODE_STATE_ALLOCATED) &&
 				   (jobs_on_node == 0)) {	/* job vanished */
@@ -1345,6 +1359,28 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 		}
 	}
 
+	if (prolog_hostlist) {
+		hostlist_uniq(prolog_hostlist);
+		hostlist_ranged_string(prolog_hostlist, sizeof(host_str),
+			host_str);
+		error("Prolog failure on nodes %s, set to DOWN", host_str);
+		hostlist_destroy(prolog_hostlist);
+	}
+	if (reg_hostlist) {
+		hostlist_uniq(reg_hostlist);
+		hostlist_ranged_string(reg_hostlist, sizeof(host_str),
+			host_str);
+		debug("Nodes %s have registerd", host_str);
+		hostlist_destroy(reg_hostlist);
+	}
+	if (return_hostlist) {
+		hostlist_uniq(return_hostlist);
+		hostlist_ranged_string(return_hostlist, sizeof(host_str),
+			host_str);
+		info("Nodes %s returned to service", host_str);
+		hostlist_destroy(return_hostlist);
+	}
+
 	if (updated_job) {
 		last_node_update = time (NULL);
 		reset_job_priority();
@@ -1451,17 +1487,25 @@ void node_not_resp (char *name, time_t msg_time)
 	struct node_record *node_ptr;
 #ifdef HAVE_BGL		/* only front-end node */
 	int i;
+	char host_str[64];
+	hostlist_t no_resp_hostlist = hostlist_create("");
 
 	for (i=0; i<node_record_count; i++) {
 		node_ptr = &node_record_table_ptr[i];
+		(void) hostlist_push_host(no_resp_hostlist, node_ptr->name);
 		_node_not_resp(node_ptr, msg_time);
 	}
+	hostlist_uniq(no_resp_hostlist);
+	hostlist_ranged_string(no_resp_hostlist, sizeof(host_str), host_str);
+	error("Nodes %s not responding", host_str);
+	hostlist_destroy(no_resp_hostlist);
 #else
 	node_ptr = find_node_record (name);
 	if (node_ptr == NULL) {
 		error ("node_not_resp unable to find node %s", name);
 		return;
 	}
+	error("Node %s not responding", node_ptr->name);
 	_node_not_resp(node_ptr, msg_time);
 #endif
 }
@@ -1480,7 +1524,6 @@ static void _node_not_resp (struct node_record *node_ptr, time_t msg_time)
 		return;
 	}
 	last_node_update = time (NULL);
-	error ("Node %s not responding", node_ptr->name);
 	bit_clear (avail_node_bitmap, i);
 	node_ptr->node_state |= NODE_STATE_NO_RESPOND;
 	return;
diff --git a/src/slurmctld/ping_nodes.c b/src/slurmctld/ping_nodes.c
index e5d6a36a40a..34f8fe48f6c 100644
--- a/src/slurmctld/ping_nodes.c
+++ b/src/slurmctld/ping_nodes.c
@@ -279,7 +279,7 @@ void ping_nodes (void)
 		hostlist_uniq(down_hostlist);
 		hostlist_ranged_string(down_hostlist,
 			sizeof(host_str), host_str);
-		error("Node %s not responding, setting DOWN", host_str);
+		error("Nodes %s not responding, setting DOWN", host_str);
 		hostlist_destroy(down_hostlist);
 	}
 	hostlist_destroy(ping_hostlist);
-- 
GitLab