diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 140d8af2cb497192a82afa25a6fb840892758852..73e76b214f02173661c0f10decd228af0005d264 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1014,13 +1014,18 @@ static void *_slurmctld_background(void *no_data) last_sched_time = last_checkpoint_time = last_group_time = now; last_purge_job_time = last_trigger = now; last_timelimit_time = last_assert_primary_time = now; - if (slurmctld_conf.slurmd_timeout) { - /* We ping nodes that haven't responded in SlurmdTimeout/2, + if (slurmctld_conf.slurmd_timeout || + slurmctld_conf.health_check_interval) { + /* We ping nodes that haven't responded in SlurmdTimeout/3, * but need to do the test at a higher frequency or we might * DOWN nodes with times that fall in the gap. */ - ping_interval = slurmctld_conf.slurmd_timeout / 3; - } else - ping_interval = 60 * 60 * 24 * 356; /* one year */ + ping_interval = MIN((slurmctld_conf.slurmd_timeout/3), + slurmctld_conf.health_check_interval); + } else { + /* This will just ping non-responding nodes + * and restore them to service */ + ping_interval = 100; /* 100 seconds */ + } last_ping_node_time = now + (time_t)MIN_CHECKIN_TIME - ping_interval; last_ping_srun_time = now; last_node_acct = now;