Skip to content
Snippets Groups Projects
Commit 4749a954 authored by Moe Jette's avatar Moe Jette
Browse files

tweak the srun logic to handle the max wait time after first task exits.

the SIGALRM handling needed to be changed
parent 9b1632c1
No related branches found
No related tags found
No related merge requests found
...@@ -129,7 +129,8 @@ time_t launch_start_time; ...@@ -129,7 +129,8 @@ time_t launch_start_time;
bool retry_step_begin = false; bool retry_step_begin = false;
int retry_step_cnt = 0; int retry_step_cnt = 0;
bool srun_shutdown = false; bool srun_max_timer = false;
bool srun_shutdown = false;
int sig_array[] = { int sig_array[] = {
SIGINT, SIGQUIT, SIGCONT, SIGTERM, SIGCONT, SIGINT, SIGQUIT, SIGCONT, SIGTERM, SIGCONT,
SIGALRM, SIGUSR1, SIGUSR2, SIGPIPE, 0 }; SIGALRM, SIGUSR1, SIGUSR2, SIGPIPE, 0 };
...@@ -1129,14 +1130,6 @@ _terminate_job_step(slurm_step_ctx_t *step_ctx) ...@@ -1129,14 +1130,6 @@ _terminate_job_step(slurm_step_ctx_t *step_ctx)
slurm_kill_job_step(job_id, step_id, SIGKILL); slurm_kill_job_step(job_id, step_id, SIGKILL);
} }
static void
_handle_max_wait(int signo)
{
info("First task exited %ds ago", opt.max_wait);
task_state_print(task_state, (log_f) info);
_terminate_job_step(job->step_ctx);
}
static char * static char *
_hostset_to_string(hostset_t hs) _hostset_to_string(hostset_t hs)
{ {
...@@ -1235,7 +1228,7 @@ static void _setup_max_wait_timer(void) ...@@ -1235,7 +1228,7 @@ static void _setup_max_wait_timer(void)
* tasks don't finish within opt.max_wait seconds. * tasks don't finish within opt.max_wait seconds.
*/ */
verbose("First task exited. Terminating job in %ds.", opt.max_wait); verbose("First task exited. Terminating job in %ds.", opt.max_wait);
xsignal(SIGALRM, _handle_max_wait); srun_max_timer = true;
alarm(opt.max_wait); alarm(opt.max_wait);
} }
...@@ -1448,6 +1441,13 @@ static void *_srun_signal_mgr(void *no_data) ...@@ -1448,6 +1441,13 @@ static void *_srun_signal_mgr(void *no_data)
case SIGPIPE: case SIGPIPE:
_handle_pipe(); _handle_pipe();
break; break;
case SIGALRM:
if (srun_max_timer) {
info("First task exited %ds ago", opt.max_wait);
task_state_print(task_state, (log_f) info);
_terminate_job_step(job->step_ctx);
}
break;
default: default:
slurm_step_launch_fwd_signal(job->step_ctx, sig); slurm_step_launch_fwd_signal(job->step_ctx, sig);
break; break;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment