From 1ed38f267e7044b249fb29f5d90ab9100d2a4edc Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Fri, 18 Mar 2016 08:13:10 -0700 Subject: [PATCH] Fix for srun abort on SIGSTOP+SIGCONT Avoid possibly aborting srun that gets simultaneous SIGSTOP+SIGCONT while creating the job step. The result is that the signal handler gets an argument (the signal received) of zero. Here's a log, window 1: $ srun hostname srun: Job step creation temporarily disabled, retrying srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 18 srun: I Got signal 0 srun: Cancelled pending job step Window 2: $ kill -STOP 18696 ; kill -CONT 18696 $ kill -STOP 18696 ; kill -CONT 18696 $ kill -STOP 18696 ; kill -CONT 18696 .... bug 2494 --- NEWS | 2 ++ src/api/step_ctx.c | 3 ++- src/srun/libsrun/allocate.c | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index 07caefd5807..c99c0c1f40d 100644 --- a/NEWS +++ b/NEWS @@ -60,6 +60,8 @@ documents those changes that are of interest to users and administrators. -- Update gang scheduling data structures when job changes in size. -- Associations - prevent hash table corruption if uid initially unset for a user, which can cause slurmctld to crash if that user is deleted. + -- Avoid possibly aborting srun that gets simultaneous SIGSTOP+SIGCONT while + creating the job step. 
* Changes in Slurm 15.08.8 ========================== diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c index 0302c6652f6..fe3be8b3e96 100644 --- a/src/api/step_ctx.c +++ b/src/api/step_ctx.c @@ -77,7 +77,8 @@ static int destroy_step = 0; static void _signal_while_allocating(int signo) { debug("Got signal %d", signo); - if (signo == SIGCONT) + /* NOTE: Near simultaneous SIGSTOP+SIGCONT can result in signo == 0 */ + if ((signo == SIGCONT) || (signo == 0)) return; destroy_step = signo; diff --git a/src/srun/libsrun/allocate.c b/src/srun/libsrun/allocate.c index d74df0b5834..c861c9532d4 100644 --- a/src/srun/libsrun/allocate.c +++ b/src/srun/libsrun/allocate.c @@ -118,7 +118,8 @@ static void *_safe_signal_while_allocating(void *in_data) int signo = *(int *)in_data; debug("Got signal %d", signo); - if (signo == SIGCONT) + /* NOTE: Near simultaneous SIGSTOP+SIGCONT can result in signo == 0 */ + if ((signo == SIGCONT) || (signo == 0)) return NULL; destroy_job = 1; -- GitLab