From 6f6ea3ee7659a64ed3edafcc2584dcca705a5a1b Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 15 May 2006 22:05:00 +0000 Subject: [PATCH] Test for run of slurmd ping every SlurmdTimeout/3 and actually ping if last response >SlurmdTimeout/2 ago. --- src/slurmctld/controller.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index f60dc11dda6..2a1a1e14c54 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -732,10 +732,13 @@ static void *_slurmctld_background(void *no_data) now = time(NULL); last_sched_time = last_checkpoint_time = last_group_time = now; last_timelimit_time = last_assert_primary_time = now; - if (slurmctld_conf.slurmd_timeout) - ping_interval = slurmctld_conf.slurmd_timeout / 2; - else - ping_interval = 60; + if (slurmctld_conf.slurmd_timeout) { + /* We ping nodes that haven't responded in SlurmdTimeout/2, + * but need to do the test at a higher frequency or we might + * DOWN nodes with times that fall in the gap. */ + ping_interval = slurmctld_conf.slurmd_timeout / 3; + } else + ping_interval = 60 * 60 * 24 * 356; /* one year */ last_ping_node_time = now + (time_t)MIN_CHECKIN_TIME - ping_interval; last_ping_srun_time = now; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); -- GitLab