From 9bf2a357b2cc26c1fdae44b3a602a0ed2fb12853 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Tue, 3 Dec 2013 18:18:25 -0600
Subject: [PATCH] CRAY - Fix race condition if trying to signal or wait on a job container that never had a pid added to it. (The job ended before it began)

---
 src/plugins/proctrack/cray/proctrack_cray.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/plugins/proctrack/cray/proctrack_cray.c b/src/plugins/proctrack/cray/proctrack_cray.c
index 800e19aac97..e933b09dec3 100644
--- a/src/plugins/proctrack/cray/proctrack_cray.c
+++ b/src/plugins/proctrack/cray/proctrack_cray.c
@@ -204,9 +204,16 @@ int proctrack_p_plugin_add(stepd_step_rec_t *job, pid_t pid)
 
 int proctrack_p_plugin_signal(uint64_t id, int sig)
 {
-	if ((job_killjid((jid_t) id, sig) < 0)
-	    && (errno != ENODATA) && (errno != EBADF) )
-		return (SLURM_ERROR);
+	if (!threadid) {
+		if ((job_killjid((jid_t) id, sig) < 0)
+		    && (errno != ENODATA) && (errno != EBADF) )
+			return (SLURM_ERROR);
+	} else if (sig == SIGKILL) {
+		/* job ended before it started */
+		_end_container_thread();
+	} else
+		error("Trying to send signal %d a container 0x%08lx "
+		      "that hasn't had anything added to it yet", sig, id);
 	return (SLURM_SUCCESS);
 }
 
@@ -216,7 +223,9 @@ int proctrack_p_plugin_destroy(uint64_t id)
 
 	debug("destroying 0x%08lx 0x%08lx", id, threadid);
 
-	job_waitjid((jid_t) id, &status, 0);
+	if (!threadid)
+		job_waitjid((jid_t) id, &status, 0);
+
 	/* Assume any error means job doesn't exist. Therefore,
 	 * return SUCCESS to slurmd so it doesn't retry continuously
 	 */
@@ -248,7 +257,8 @@ bool proctrack_p_plugin_has_pid (uint64_t cont_id, pid_t pid)
 int proctrack_p_plugin_wait(uint64_t id)
 {
 	int status;
-	if (job_waitjid((jid_t) id, &status, 0) == (jid_t)-1)
+
+	if (!threadid && job_waitjid((jid_t) id, &status, 0) == (jid_t)-1)
 		return SLURM_ERROR;
 
 	return SLURM_SUCCESS;
-- 
GitLab