From ccf66079346cf74ff6942f7f57a69c35e41dd0ca Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Fri, 30 Sep 2016 09:31:31 -0600 Subject: [PATCH] Srun pending steps, decrease retry frequency Previous logic would always retry in 60 to 69 secs (based upon srun PID). New logic will wait up to SlurmctldTimeout + 9 secs (minimum value 60 seconds, maximum 309 seconds). --- src/common/slurm_protocol_api.c | 19 +++++++++++++++++++ src/common/slurm_protocol_api.h | 7 +++++++ src/srun/libsrun/launch.c | 8 +++++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index c97cfa52223..1d4a258fc24 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1510,6 +1510,25 @@ char *slurm_get_slurmd_plugstack(void) return slurmd_plugstack; } +/* slurm_get_slurmctld_timeout + * get the slurmctld_timeout value from the + * slurmctld_conf object + * RET uint16_t - slurmctld timeout in seconds (0 if slurmdbd_conf is set) + */ +uint16_t slurm_get_slurmctld_timeout(void) +{ + uint16_t slurmctld_timeout = 0; + slurm_ctl_conf_t *conf; + + if (slurmdbd_conf) { + } else { + conf = slurm_conf_lock(); + slurmctld_timeout = conf->slurmctld_timeout; + slurm_conf_unlock(); + } + return slurmctld_timeout; +} + /* slurm_get_accounting_storage_type * returns the accounting storage type from slurmctld_conf object * RET char * - accounting storage type, MUST be xfreed by caller diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 386d0493838..b9c8805ca16 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -278,6 +278,13 @@ char *slurm_get_slurmctld_plugstack(void); */ char *slurm_get_slurmd_plugstack(void); +/* slurm_get_slurmctld_timeout + * get the slurmctld_timeout value from the + * slurmctld_conf object + * RET uint16_t - slurmctld timeout in seconds (0 if slurmdbd_conf is set) + */ +uint16_t slurm_get_slurmctld_timeout(void); + /* 
slurm_get_plugin_dir * get plugin directory from slurmctld_conf object from slurmctld_conf object * RET char * - plugin directory, MUST be xfreed by caller diff --git a/src/srun/libsrun/launch.c b/src/srun/libsrun/launch.c index ea0aeff6578..7513447620e 100644 --- a/src/srun/libsrun/launch.c +++ b/src/srun/libsrun/launch.c @@ -163,7 +163,7 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, { int i, j, rc; unsigned long step_wait = 0; - uint16_t base_dist; + uint16_t base_dist, slurmctld_timeout; if (!job) { error("launch_common_create_job_step: no job given"); @@ -315,8 +315,10 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, srun_begin_time)) * 1000; } else { - /* Wait 60 to 70 seconds for response */ - step_wait = (getpid() % 10) * 1000 + 60000; + slurmctld_timeout = MIN(300, MAX(60, + slurm_get_slurmctld_timeout())); + step_wait = ((getpid() % 10) + + slurmctld_timeout) * 1000; } job->step_ctx = slurm_step_ctx_create_timeout( &job->ctx_params, step_wait); -- GitLab