From 1ed38f267e7044b249fb29f5d90ab9100d2a4edc Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Fri, 18 Mar 2016 08:13:10 -0700
Subject: [PATCH] Fix for srun abort on SIGSTOP+SIGCONT

Avoid possibly aborting an srun that gets simultaneous SIGSTOP+SIGCONT while
    creating the job step. The result of such signals is that the signal
    handler gets an argument (the signal received) of zero.

Here's a log, window 1:
$ srun hostname
srun: Job step creation temporarily disabled, retrying
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 18
srun: I Got signal 0
srun: Cancelled pending job step

Window 2:
$  kill -STOP 18696 ; kill -CONT 18696
$  kill -STOP 18696 ; kill -CONT 18696
$  kill -STOP 18696 ; kill -CONT 18696
....

bug 2494
---
 NEWS                        | 2 ++
 src/api/step_ctx.c          | 3 ++-
 src/srun/libsrun/allocate.c | 3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/NEWS b/NEWS
index 07caefd5807..c99c0c1f40d 100644
--- a/NEWS
+++ b/NEWS
@@ -60,6 +60,8 @@ documents those changes that are of interest to users and administrators.
  -- Update gang scheduling data structures when job changes in size.
  -- Associations - prevent hash table corruption if uid initially unset for
     a user, which can cause slurmctld to crash if that user is deleted.
+ -- Avoid possibly aborting srun that gets simultaneous SIGSTOP+SIGCONT while
+    creating the job step.
 
 * Changes in Slurm 15.08.8
 ==========================
diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c
index 0302c6652f6..fe3be8b3e96 100644
--- a/src/api/step_ctx.c
+++ b/src/api/step_ctx.c
@@ -77,7 +77,8 @@ static int destroy_step = 0;
 static void _signal_while_allocating(int signo)
 {
 	debug("Got signal %d", signo);
-	if (signo == SIGCONT)
+	/* NOTE: Near simultaneous SIGSTOP+SIGCONT can result in signo == 0 */
+	if ((signo == SIGCONT) || (signo == 0))
 		return;
 
 	destroy_step = signo;
diff --git a/src/srun/libsrun/allocate.c b/src/srun/libsrun/allocate.c
index d74df0b5834..c861c9532d4 100644
--- a/src/srun/libsrun/allocate.c
+++ b/src/srun/libsrun/allocate.c
@@ -118,7 +118,8 @@ static void *_safe_signal_while_allocating(void *in_data)
 	int signo = *(int *)in_data;
 
 	debug("Got signal %d", signo);
-	if (signo == SIGCONT)
+	/* NOTE: Near simultaneous SIGSTOP+SIGCONT can result in signo == 0 */
+	if ((signo == SIGCONT) || (signo == 0))
 		return NULL;
 
 	destroy_job = 1;
-- 
GitLab