From 6728119fabdd79b3eab553a66a0de5596bc835ba Mon Sep 17 00:00:00 2001
From: "Mark A. Grondona" <mgrondona@llnl.gov>
Date: Thu, 2 Jul 2015 13:05:06 -0700
Subject: [PATCH] slurmd: return failure to signal job step if prolog is
 running

If the job prolog is running, we can't send a signal to the job step
tasks, so return SLURM_FAILURE instead of ESLURM_INVALID_JOB_ID.
This should cause the caller to retry instead of assuming the
job step is not running on the node.
---
 src/slurmd/slurmd/req.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 50e46dbde07..709eb04c676 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2634,6 +2634,15 @@ _signal_jobstep(uint32_t jobid, uint32_t stepid, uid_t req_uid,
 	uid_t uid;
 	uint16_t protocol_version;
 
+	/*  There will be no stepd if the prolog is still running.
+	 *  Return failure so the caller can retry.
+	 */
+	if (_prolog_is_running (jobid)) {
+		info ("signal %d req for %u.%u while prolog is running."
+		      " Returning failure.", signal, jobid, stepid);
+		return SLURM_FAILURE;
+	}
+
 	fd = stepd_connect(conf->spooldir, conf->node_name, jobid, stepid,
 			   &protocol_version);
 	if (fd == -1) {
-- 
GitLab