From d2b3aec31530b9dc08093106a3ae631fdabcd700 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 3 Jun 2010 17:41:39 +0000
Subject: [PATCH] fix timing problem with slurmd's new prolog timeout logic

---
 src/slurmd/slurmd/req.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index f67187a333c..b2d4bfe480a 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -114,7 +114,9 @@ typedef struct {
 typedef struct {
 	uint32_t job_id;
 	uint16_t msg_timeout;
+	bool *prolog_fini;		/* set true once the prolog script returns */
 	pthread_cond_t *timer_cond;
+	pthread_mutex_t *timer_mutex;	/* held while testing/setting *prolog_fini */
 } timer_struct_t;
 
 static int  _abort_job(uint32_t job_id);
@@ -3576,21 +3578,23 @@ _destroy_env(char **env)
 
 static void *_prolog_timer(void *x)
 {
-	int delay_time, rc;
+	int delay_time, rc = SLURM_SUCCESS;
 	struct timespec abs_time;
 	slurm_msg_t msg;
 	job_notify_msg_t notify_req;
 	char srun_msg[128];
-	pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
 	timer_struct_t *timer_struct = (timer_struct_t *) x;
 
 	delay_time = MAX(2, (timer_struct->msg_timeout - 2));
 	abs_time.tv_sec  = time(NULL) + delay_time;
 	abs_time.tv_nsec = 0;
-	slurm_mutex_lock(&timer_mutex);
-	rc = pthread_cond_timedwait(timer_struct->timer_cond, &timer_mutex,
-				    &abs_time);
-	slurm_mutex_unlock(&timer_mutex);
+	slurm_mutex_lock(timer_struct->timer_mutex);
+	if (!*(timer_struct->prolog_fini)) {	/* test the flag, not the pointer */
+		rc = pthread_cond_timedwait(timer_struct->timer_cond,
+					    timer_struct->timer_mutex,
+					    &abs_time);
+	}
+	slurm_mutex_unlock(timer_struct->timer_mutex);
 
 	if (rc != ETIMEDOUT)
 		return NULL;
@@ -3617,10 +3621,12 @@ _run_prolog(uint32_t jobid, uid_t uid, char *resv_id,
 				   spank_job_env_size);
 	time_t start_time = time(NULL), diff_time;
 	static uint16_t msg_timeout = 0;
-	pthread_t      timer_id;
-	pthread_attr_t timer_attr;
-	pthread_cond_t timer_cond = PTHREAD_COND_INITIALIZER;
-	timer_struct_t timer_struct;
+	pthread_t       timer_id;
+	pthread_attr_t  timer_attr;
+	pthread_cond_t  timer_cond  = PTHREAD_COND_INITIALIZER;
+	pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
+	timer_struct_t  timer_struct;
+	bool prolog_fini = false;
 
 	if (msg_timeout == 0)
 		msg_timeout = slurm_get_msg_timeout();
@@ -3633,10 +3639,15 @@ _run_prolog(uint32_t jobid, uid_t uid, char *resv_id,
 	slurm_attr_init(&timer_attr);
 	timer_struct.job_id      = jobid;
 	timer_struct.msg_timeout = msg_timeout;
+	timer_struct.prolog_fini = &prolog_fini;
 	timer_struct.timer_cond  = &timer_cond;
+	timer_struct.timer_mutex = &timer_mutex;
 	pthread_create(&timer_id, &timer_attr, &_prolog_timer, &timer_struct);
 	rc = run_script("prolog", my_prolog, jobid, -1, my_env);
+	slurm_mutex_lock(&timer_mutex);
+	prolog_fini = true;
 	pthread_cond_broadcast(&timer_cond);
+	slurm_mutex_unlock(&timer_mutex);
 	_remove_job_running_prolog(jobid);
 	xfree(my_prolog);
 	_destroy_env(my_env);
-- 
GitLab