From ccf66079346cf74ff6942f7f57a69c35e41dd0ca Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Fri, 30 Sep 2016 09:31:31 -0600
Subject: [PATCH] Srun pending steps, decrease retry frequency

Previous logic would always retry in 60 to 69 secs (based upon srun PID).
  New logic waits SlurmctldTimeout (clamped to the range 60 to 300 secs)
  plus 0 to 9 secs based upon srun PID, so the wait is between 60 and
  309 seconds.
---
 src/common/slurm_protocol_api.c | 19 +++++++++++++++++++
 src/common/slurm_protocol_api.h |  7 +++++++
 src/srun/libsrun/launch.c       |  8 +++++---
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index c97cfa52223..1d4a258fc24 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -1510,6 +1510,25 @@ char *slurm_get_slurmd_plugstack(void)
 	return slurmd_plugstack;
 }
 
+/* slurm_get_slurmctld_timeout
+ * get slurmctld_timeout from the slurmctld_conf object, read between
+ * slurm_conf_lock() and slurm_conf_unlock(); 0 if slurmdbd_conf is set
+ * RET uint16_t - slurmctld timeout in seconds
+ */
+uint16_t slurm_get_slurmctld_timeout(void)
+{
+	uint16_t slurmctld_timeout = 0;
+	slurm_ctl_conf_t *conf;
+
+	if (slurmdbd_conf) {	/* leave the default of 0 */
+	} else {
+		conf = slurm_conf_lock();
+		slurmctld_timeout = conf->slurmctld_timeout;
+		slurm_conf_unlock();
+	}
+	return slurmctld_timeout;
+}
+
 /* slurm_get_accounting_storage_type
  * returns the accounting storage type from slurmctld_conf object
  * RET char *    - accounting storage type,  MUST be xfreed by caller
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 386d0493838..b9c8805ca16 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -278,6 +278,13 @@ char *slurm_get_slurmctld_plugstack(void);
  */
 char *slurm_get_slurmd_plugstack(void);
 
+/* slurm_get_slurmctld_timeout
+ * get slurmctld_timeout from the slurmctld_conf object
+ * (the definition returns 0 when slurmdbd_conf is set)
+ * RET uint16_t - slurmctld timeout in seconds
+ */
+uint16_t slurm_get_slurmctld_timeout(void);
+
 /* slurm_get_plugin_dir
  * get plugin directory from slurmctld_conf object from slurmctld_conf object
  * RET char *   - plugin directory, MUST be xfreed by caller
diff --git a/src/srun/libsrun/launch.c b/src/srun/libsrun/launch.c
index ea0aeff6578..7513447620e 100644
--- a/src/srun/libsrun/launch.c
+++ b/src/srun/libsrun/launch.c
@@ -163,7 +163,7 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
 {
 	int i, j, rc;
 	unsigned long step_wait = 0;
-	uint16_t base_dist;
+	uint16_t base_dist, slurmctld_timeout;
 
 	if (!job) {
 		error("launch_common_create_job_step: no job given");
@@ -315,8 +315,10 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
 							    srun_begin_time)) *
 					    1000;
 			} else {
-				/* Wait 60 to 70 seconds for response */
-				step_wait = (getpid() % 10) * 1000 + 60000;
+				slurmctld_timeout = MIN(300, MAX(60,
+					slurm_get_slurmctld_timeout()));
+				step_wait = ((getpid() % 10) +
+					     slurmctld_timeout) * 1000;
 			}
 			job->step_ctx = slurm_step_ctx_create_timeout(
 						&job->ctx_params, step_wait);
-- 
GitLab