From 6728119fabdd79b3eab553a66a0de5596bc835ba Mon Sep 17 00:00:00 2001
From: "Mark A. Grondona" <mgrondona@llnl.gov>
Date: Thu, 2 Jul 2015 13:05:06 -0700
Subject: [PATCH] slurmd: return failure to signal job step if prolog is running

If the job prolog is running we can't send a signal to job step tasks,
so return SLURM_FAILURE instead of ESLURM_INVALID_JOB_ID. This should
cause the caller to retry, instead of assuming the job step is not
running on the node.
---
 src/slurmd/slurmd/req.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 50e46dbde07..709eb04c676 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2634,6 +2634,15 @@ _signal_jobstep(uint32_t jobid, uint32_t stepid, uid_t req_uid,
 	uid_t uid;
 	uint16_t protocol_version;
 
+	/* There will be no stepd if the prolog is still running
+	 * Return failure so caller can retry.
+	 */
+	if (_prolog_is_running (jobid)) {
+		info ("signal %d req for %u.%u while prolog is running."
+		      " Returning failure.", signal, jobid, stepid);
+		return SLURM_FAILURE;
+	}
+
 	fd = stepd_connect(conf->spooldir, conf->node_name, jobid, stepid,
 			   &protocol_version);
 	if (fd == -1) {
-- 
GitLab