From 4749a9543d516a21ece0adfca89c448a78fee65b Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 29 Nov 2010 21:48:32 +0000
Subject: [PATCH] tweak the srun logic to handle the max wait time after first
 task exits. the SIGALRM handling needed to be changed

---
 src/srun/srun.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/srun/srun.c b/src/srun/srun.c
index e1c9928c5f6..eb098e5a863 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -129,7 +129,8 @@ time_t launch_start_time;
 bool   retry_step_begin = false;
 int    retry_step_cnt = 0;
 
-bool srun_shutdown = false;
+bool srun_max_timer = false;
+bool srun_shutdown  = false;
 int sig_array[] = {
 	SIGINT,  SIGQUIT, SIGCONT, SIGTERM, SIGCONT,
 	SIGALRM, SIGUSR1, SIGUSR2, SIGPIPE, 0 };
@@ -1129,14 +1130,6 @@ _terminate_job_step(slurm_step_ctx_t *step_ctx)
 	slurm_kill_job_step(job_id, step_id, SIGKILL);
 }
 
-static void
-_handle_max_wait(int signo)
-{
-	info("First task exited %ds ago", opt.max_wait);
-	task_state_print(task_state, (log_f) info);
-	_terminate_job_step(job->step_ctx);
-}
-
 static char *
 _hostset_to_string(hostset_t hs)
 {
@@ -1235,7 +1228,7 @@ static void _setup_max_wait_timer(void)
 	 *   tasks don't finish within opt.max_wait seconds.
 	 */
 	verbose("First task exited. Terminating job in %ds.", opt.max_wait);
-	xsignal(SIGALRM, _handle_max_wait);
+	srun_max_timer = true;
 	alarm(opt.max_wait);
 }
 
@@ -1448,6 +1441,13 @@ static void *_srun_signal_mgr(void *no_data)
 		case SIGPIPE:
 			_handle_pipe();
 			break;
+		case SIGALRM:
+			if (srun_max_timer) {
+				info("First task exited %ds ago", opt.max_wait);
+				task_state_print(task_state, (log_f) info);
+				_terminate_job_step(job->step_ctx);
+			}
+			break;
 		default:
 			slurm_step_launch_fwd_signal(job->step_ctx, sig);
 			break;
-- 
GitLab