From 2f34113d6a28890d59924e96f0d26685d72f6c6b Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 11 Aug 2008 20:00:10 +0000
Subject: [PATCH] If a DOWN node is resumed, set its state to IDLE &
 NOT_RESPONDING and ping the node immediately to clear the NOT_RESPONDING
 flag.

---
 NEWS                       | 2 ++
 src/slurmctld/controller.c | 6 ++++--
 src/slurmctld/node_mgr.c   | 8 ++++++--
 src/slurmctld/ping_nodes.c | 3 ++-
 src/slurmctld/slurmctld.h  | 1 +
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index 93271a2798b..4a46f925e37 100644
--- a/NEWS
+++ b/NEWS
@@ -32,6 +32,8 @@ documents those changes that are of interest to users and admins.
     expression rather than one line per node. Frequency of log messages is 
     dependent upon SlurmctldDebug value from 300 seconds at SlurmctldDebug<=3
     to 1 second at SlurmctldDebug>=5.
+ -- If a DOWN node is resumed, set its state to IDLE & NOT_RESPONDING and 
+    ping the node immediately to clear the NOT_RESPONDING flag.
  
 * Changes in SLURM 1.3.6
 ========================
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index c001feaeb49..335eb978859 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -143,6 +143,7 @@ int bg_recover = DEFAULT_RECOVER;
 char *slurmctld_cluster_name = NULL; /* name of cluster */
 void *acct_db_conn = NULL;
 int accounting_enforce = 0;
+bool ping_nodes_now = false;
 
 /* Local variables */
 static int	daemonize = DEFAULT_DAEMONIZE;
@@ -1097,12 +1098,13 @@ static void *_slurmctld_background(void *no_data)
 				unlock_slurmctld(node_write_lock);
 			}
 		}
-
-		if (difftime(now, last_ping_node_time) >= ping_interval) {
+		if ((difftime(now, last_ping_node_time) >= ping_interval) ||
+		    ping_nodes_now) {
 			static bool msg_sent = false;
 			if (is_ping_done()) {
 				msg_sent = false;
 				last_ping_node_time = now;
+				ping_nodes_now = false;
 				lock_slurmctld(node_write_lock);
 				ping_nodes();
 				unlock_slurmctld(node_write_lock);
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 7283b0f0a9f..bd523c83380 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1067,9 +1067,13 @@ int update_node ( update_node_msg_t * update_node_msg )
 				node_ptr->node_state &= (~NODE_STATE_DRAIN);
 				node_ptr->node_state &= (~NODE_STATE_FAIL);
 				base_state &= NODE_STATE_BASE;
-				if (base_state == NODE_STATE_DOWN)
+				if (base_state == NODE_STATE_DOWN) {
 					state_val = NODE_STATE_IDLE;
-				else
+					node_ptr->node_state |= 
+							NODE_STATE_NO_RESPOND;
+					node_ptr->last_response = now;
+					ping_nodes_now = true;
+				} else
 					state_val = base_state;
 			}
 			if (state_val == NODE_STATE_DOWN) {
diff --git a/src/slurmctld/ping_nodes.c b/src/slurmctld/ping_nodes.c
index fbbd8d86310..1317f9ae153 100644
--- a/src/slurmctld/ping_nodes.c
+++ b/src/slurmctld/ping_nodes.c
@@ -217,7 +217,8 @@ void ping_nodes (void)
 			continue;
 		}
 
-		if (node_ptr->last_response >= still_live_time)
+		if ((!no_resp_flag) && 
+		    (node_ptr->last_response >= still_live_time))
 			continue;
 
 		/* Do not keep pinging down nodes since this can induce
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index beea2391f5b..96268279ba3 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -237,6 +237,7 @@ extern uint32_t total_cpus;		/* count of CPUs in the entire cluster */
 extern bitstr_t *idle_node_bitmap;	/* bitmap of idle nodes */
 extern bitstr_t *share_node_bitmap;	/* bitmap of sharable nodes */
 extern bitstr_t *up_node_bitmap;	/* bitmap of up nodes, not DOWN */
+extern bool ping_nodes_now;		/* if set, ping nodes immediately */
 
 /*****************************************************************************\
  *  PARTITION parameters and data structures
-- 
GitLab