diff --git a/NEWS b/NEWS index 5109f0ac9c1beee5206f480c9c49c1262676345f..86221765e1b4b2c668915d14aba7b5dbdf94d2a9 100644 --- a/NEWS +++ b/NEWS @@ -431,6 +431,7 @@ documents those changes that are of interest to users and administrators. -- Cray modulefile: avoid removing /usr/bin from path on module unload. -- Fix issue when resetting the partition pointers on nodes. -- Show reason field in 'sinfo -R' when nodes is marked as failed. + -- Fix potential slurmstepd segfault when the extern step fails to start. * Changes in Slurm 17.02.9 ========================== diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index a790a01085f07ea976ff462ae4d87682d672e62c..5bd3c0c6f78d064fae4a48936762c671e4083d9b 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -2100,6 +2100,7 @@ static int _spawn_prolog_stepd(slurm_msg_t *msg) */ } else { hostset_t step_hset = hostset_create(req->nodes); + int rc; debug3("%s: call to _forkexec_slurmstepd", __func__); rc = _forkexec_slurmstepd(LAUNCH_TASKS, (void *)launch_req, @@ -2107,6 +2108,10 @@ static int _spawn_prolog_stepd(slurm_msg_t *msg) msg->protocol_version); debug3("%s: return from _forkexec_slurmstepd %d", __func__, rc); + + if (rc != SLURM_SUCCESS) + _launch_job_fail(req->job_id, rc); + if (step_hset) hostset_destroy(step_hset); } diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index b5e5ccfc18ccbbeda50df790ac8eb3185ff66e51..b63c9764bef64e9b1adb9b3441a23bb3913dc625 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -2378,6 +2378,18 @@ _send_launch_failure(launch_tasks_request_msg_t *msg, slurm_addr_t *cli, int rc, int nodeid; char *name = NULL; + /* + * The extern step can get here if something goes wrong starting the + * step. If this does happen we don't have to contact the srun since + * there isn't one, just return. 
+ */ + if ((msg->job_step_id == SLURM_EXTERN_CONT) || + !msg->resp_port || !msg->num_resp_port) { + debug2("%s: The extern step has nothing to send a launch failure to", + __func__); + return; + } + #ifndef HAVE_FRONT_END nodeid = nodelist_find(msg->complete_nodelist, conf->node_name); name = xstrdup(conf->node_name);