From 9bf2a357b2cc26c1fdae44b3a602a0ed2fb12853 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Tue, 3 Dec 2013 18:18:25 -0600
Subject: [PATCH] CRAY - Fix race condition if trying to signal or wait on a
 job container that never had a pid added to it.

(The job ended before it began)
---
 src/plugins/proctrack/cray/proctrack_cray.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/plugins/proctrack/cray/proctrack_cray.c b/src/plugins/proctrack/cray/proctrack_cray.c
index 800e19aac97..e933b09dec3 100644
--- a/src/plugins/proctrack/cray/proctrack_cray.c
+++ b/src/plugins/proctrack/cray/proctrack_cray.c
@@ -204,9 +204,16 @@ int proctrack_p_plugin_add(stepd_step_rec_t *job, pid_t pid)
 
 int proctrack_p_plugin_signal(uint64_t id, int sig)
 {
-	if ((job_killjid((jid_t) id, sig) < 0)
-	   && (errno != ENODATA) && (errno != EBADF) )
-		return (SLURM_ERROR);
+	if (!threadid) {
+		if ((job_killjid((jid_t) id, sig) < 0)
+		    && (errno != ENODATA) && (errno != EBADF) )
+			return (SLURM_ERROR);
+	} else if (sig == SIGKILL) {
+		/* job ended before it started: no pid was ever added */
+		_end_container_thread();
+	} else
+		error("Trying to send signal %d to a container 0x%08lx "
+		      "that hasn't had anything added to it yet", sig, id);
 	return (SLURM_SUCCESS);
 }
 
@@ -216,7 +223,9 @@ int proctrack_p_plugin_destroy(uint64_t id)
 
 	debug("destroying 0x%08lx 0x%08lx", id, threadid);
 
-	job_waitjid((jid_t) id, &status, 0);
+	if (!threadid)
+		job_waitjid((jid_t) id, &status, 0);
+
 	/*  Assume any error means job doesn't exist. Therefore,
 	 *   return SUCCESS to slurmd so it doesn't retry continuously
 	 */
@@ -248,7 +257,8 @@ bool proctrack_p_plugin_has_pid (uint64_t cont_id, pid_t pid)
 int proctrack_p_plugin_wait(uint64_t id)
 {
 	int status;
-	if (job_waitjid((jid_t) id, &status, 0) == (jid_t)-1)
+
+	if (!threadid && job_waitjid((jid_t) id, &status, 0) == (jid_t)-1)
 		return SLURM_ERROR;
 
 	return SLURM_SUCCESS;
-- 
GitLab