From 6f6ea3ee7659a64ed3edafcc2584dcca705a5a1b Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 15 May 2006 22:05:00 +0000
Subject: [PATCH] Test for run of slurmd ping every SlurmdTimeout/3 and actually
 ping if last response >SlurmdTimeout/2 ago.

---
 src/slurmctld/controller.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index f60dc11dda6..2a1a1e14c54 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -732,10 +732,13 @@ static void *_slurmctld_background(void *no_data)
 	now = time(NULL);
 	last_sched_time = last_checkpoint_time = last_group_time = now;
 	last_timelimit_time = last_assert_primary_time = now;
-	if (slurmctld_conf.slurmd_timeout)
-		ping_interval = slurmctld_conf.slurmd_timeout / 2;
-	else
-		ping_interval = 60;
+	if (slurmctld_conf.slurmd_timeout) {
+		/* We ping nodes that haven't responded in SlurmdTimeout/2,
+		 * but need to do the test at a higher frequency or we might
+		 * DOWN nodes with times that fall in the gap. */
+		ping_interval = slurmctld_conf.slurmd_timeout / 3;
+	} else
+		ping_interval = 60 * 60 * 24 * 356;	/* about one year (356 days) */
 	last_ping_node_time = now + (time_t)MIN_CHECKIN_TIME - ping_interval;
 	last_ping_srun_time = now;
 	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
-- 
GitLab