diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index f60dc11dda65ad2096fb72a2dc92053158cceced..2a1a1e14c54f7f38e28c68462d43d431953f3386 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -732,10 +732,13 @@ static void *_slurmctld_background(void *no_data) now = time(NULL); last_sched_time = last_checkpoint_time = last_group_time = now; last_timelimit_time = last_assert_primary_time = now; - if (slurmctld_conf.slurmd_timeout) - ping_interval = slurmctld_conf.slurmd_timeout / 2; - else - ping_interval = 60; + if (slurmctld_conf.slurmd_timeout) { + /* We ping nodes that haven't responded in SlurmdTimeout/2, + * but need to do the test at a higher frequency or we might + * DOWN nodes with times that fall in the gap. */ + ping_interval = slurmctld_conf.slurmd_timeout / 3; + } else + ping_interval = 60 * 60 * 24 * 365; /* one year, in seconds */ last_ping_node_time = now + (time_t)MIN_CHECKIN_TIME - ping_interval; last_ping_srun_time = now; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);