diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 4fd41c1552a502b2930b2037fc21149fe55d1217..885b0a58eb39c72e5a5addfeac349230cdc4d024 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -729,6 +729,7 @@ static void *_slurmctld_background(void *no_data) static time_t last_group_time; static time_t last_ping_node_time; static time_t last_ping_srun_time; + static time_t last_purge_job_time; static time_t last_timelimit_time; static time_t last_assert_primary_time; time_t now; @@ -755,6 +756,7 @@ static void *_slurmctld_background(void *no_data) /* Let the dust settle before doing work */ now = time(NULL); last_sched_time = last_checkpoint_time = last_group_time = now; + last_purge_job_time = now; last_timelimit_time = last_assert_primary_time = now; if (slurmctld_conf.slurmd_timeout) { /* We ping nodes that haven't responded in SlurmdTimeout/2, @@ -830,12 +832,16 @@ static void *_slurmctld_background(void *no_data) unlock_slurmctld(part_write_lock); } - if (difftime(now, last_sched_time) >= PERIODIC_SCHEDULE) { - last_sched_time = now; + if (difftime(now, last_purge_job_time) >= PURGE_JOB_INTERVAL) { + last_purge_job_time = now; debug2("Performing purge of old job records"); lock_slurmctld(job_write_lock); - purge_old_job(); /* remove defunct job recs */ + purge_old_job(); unlock_slurmctld(job_write_lock); + } + + if (difftime(now, last_sched_time) >= PERIODIC_SCHEDULE) { + last_sched_time = now; if (schedule()) last_checkpoint_time = 0; /* force state save */ } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index f38f32addb55c2eeef9b5f2fc00d17af9978f92a..d0ae1c968c6a09cdf16bc906beeedc495c847e5a 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -108,6 +108,10 @@ /* Check for jobs reaching their time limit every PERIODIC_TIMEOUT seconds */ #define PERIODIC_TIMEOUT 60 +/* Attempt to purge defunct job records and resend job kill requests + * every PURGE_JOB_INTERVAL seconds */ +#define PURGE_JOB_INTERVAL 300 + /* Pathname of group file record for checking update times */ #define GROUP_FILE "/etc/group"