diff --git a/NEWS b/NEWS
index 56e048ab9ffea23dfde6d8296d907e91fafc1f43..a9025ee23e988143b047ac2fc0d5d889761d4747 100644
--- a/NEWS
+++ b/NEWS
@@ -296,6 +296,8 @@ documents those changes that are of interest to users and administrators.
     restart or reconfigure (bug was introduced in 14.03.5 "Clear record of
     a job's gres when requeued" and only applies when GRES mapped to specific
     files).
+ -- BGQ: Fix race condition when job fails due to hardware failure and is
+    requeued. Previous code could result in slurmctld abort with NULL pointer.
 
 * Changes in Slurm 14.03.9
 ==========================
diff --git a/doc/html/quickstart_admin.shtml b/doc/html/quickstart_admin.shtml
index cb48247f64cd0d1add842eb50b1ccc52bb63b55c..4ffbc9a9811204f4783fc6ddd97e7477694ec45f 100644
--- a/doc/html/quickstart_admin.shtml
+++ b/doc/html/quickstart_admin.shtml
@@ -96,10 +96,11 @@ and commands are denoted below.
 Please see the <a href=download.html>Download</a> page for references to
 required software to build these plugins.</p>
 
-<p>To build RPMs directly, copy the distributed tar-ball into the directory
-<b>/usr/src/redhat/SOURCES</b> and execute a command of this sort (substitute
-the appropriate Slurm version number):<br>
-<span class="commandline">rpmbuild -ta slurm-0.6.0-1.tar.bz2</span></p>
+<p>To build RPMs directly, copy the distributed tar-ball into a directory
+and execute a command of this sort (substitute the appropriate Slurm version number):<br>
+<span class="commandline">rpmbuild -ta slurm-14.03.9.tar.bz2</span></p>
+The RPM files will be installed under the <b>$(HOME)/rpmbuild</b> directory
+of the user building them.
 
 <p>You can control some aspects of the RPM built with a <i>.rpmmacros</i>
 file in your home directory. <b>Special macro definitions will likely
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 82a0f4f34f33820986e79a8aba2272910acfae4b..8c05a589438c3771b6a5cb8080bc4d25af157844 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -330,7 +330,7 @@ user environment variables (for Moab spawned jobs), or if the slurmd daemon
 gets paged from memory.
 .br
 .br
-\fBNote\fR: The test for a job being succesfully launched is only performed when
+\fBNote\fR: The test for a job being successfully launched is only performed when
 the Slurm daemon on the compute node registers state with the slurmctld daemon
 on the head node, which happens fairly rarely.
 Therefore a job will not necessarily be terminated if its start time exceeds
diff --git a/src/plugins/select/bluegene/bg_core.c b/src/plugins/select/bluegene/bg_core.c
index 2cf59a6e6e258fd7f7bf5c9962fc2b4beddb2c0a..73f0331a75f8714b8f0b4e3fc31e3c6dda71c035 100644
--- a/src/plugins/select/bluegene/bg_core.c
+++ b/src/plugins/select/bluegene/bg_core.c
@@ -337,9 +337,12 @@ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start,
 
 	if (!slurmctld_locked)
 		lock_slurmctld(job_write_lock);
-	if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, preempted, 0))) {
-		error("Couldn't requeue job %u, failing it: %s",
-		      job_id, slurm_strerror(rc));
+	rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, preempted, 0);
+	if (rc == ESLURM_JOB_PENDING) {
+		error("%s: Could not requeue pending job %u", __func__, job_id);
+	} else if (rc != SLURM_SUCCESS) {
+		error("%s: Could not requeue job %u, failing it: %s",
+		      __func__, job_id, slurm_strerror(rc));
 		job_fail(job_id, job_state);
 	}
 	if (!slurmctld_locked)
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 102325449bcf22e026b65d0dcef4d0707ece28ce..337d76d3c1a84dadfd40b9b24bb8a29242e3996c 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -484,16 +484,20 @@ int job_step_signal(uint32_t job_id, uint32_t step_id,
 	step_ptr = find_step_record(job_ptr, step_id);
 	if (step_ptr == NULL) {
 		if (signal != SIG_NODE_FAIL) {
-			rc = ESLURM_INVALID_JOB_ID;
 			info("job_step_signal step %u.%u not found",
 			     job_id, step_id);
-			return rc;
+			return ESLURM_INVALID_JOB_ID;
 		}
-		/* If we get a node fail signal we need to process it
-		   since we will create a race condition otherwise
-		   where jobs could be started on these nodes and
-		   fail.
-		*/
+		if (job_ptr->nodes_completing == NULL) {
+			/* Job state has already been cleared for requeue.
+			 * Rely upon real-time server to put cnodes in error
+			 * state. */
+			info("%s: job %u already requeued, can not down cnodes",
+			     __func__, job_id);
+			return ESLURM_ALREADY_DONE;
+		}
+		/* If we get a node fail signal, down the cnodes to avoid
+		 * allocating them to another job. */
 		debug("job_step_signal step %u.%u not found, but got "
 		      "SIG_NODE_FAIL, so failing all nodes in allocation.",
 		      job_id, step_id);
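
Note on the bg_core.c hunk: bg_requeue_job() now treats ESLURM_JOB_PENDING as a
log-only case, and only fails the job for other non-success codes. Below is a
minimal, self-contained sketch of that control flow for illustration; it is not
Slurm source. The *_stub helpers and the numeric error-code values are
assumptions made so the sketch compiles on its own.

/* Sketch: pending-requeue handling in the style of bg_requeue_job(). */
#include <stdio.h>

#define SLURM_SUCCESS      0
#define ESLURM_JOB_PENDING 2014	/* placeholder value for this sketch */

/* Stub standing in for slurmctld's job_requeue(); always reports pending. */
static int job_requeue_stub(unsigned job_id)
{
	(void)job_id;
	return ESLURM_JOB_PENDING;
}

/* Stub standing in for job_fail(). */
static void job_fail_stub(unsigned job_id)
{
	printf("failing job %u\n", job_id);
}

int main(void)
{
	unsigned job_id = 1234;
	int rc = job_requeue_stub(job_id);

	if (rc == ESLURM_JOB_PENDING) {
		/* Job is already pending: log it, do not fail the job. */
		printf("%s: Could not requeue pending job %u\n",
		       __func__, job_id);
	} else if (rc != SLURM_SUCCESS) {
		/* Any other error: log and fail the job. */
		printf("%s: Could not requeue job %u, failing it\n",
		       __func__, job_id);
		job_fail_stub(job_id);
	}
	return 0;
}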
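
Note on the step_mgr.c hunk: when SIG_NODE_FAIL arrives for a step that no
longer exists, the job's nodes_completing pointer distinguishes a job still
completing (safe to down its cnodes) from one already requeued (NULL, where
downing cnodes would race with the requeue). A hedged sketch of that guard
follows; the stand-in struct, helper name, and error-code values are
illustrative assumptions, not Slurm's definitions.

/* Sketch: skip downing cnodes once requeue has cleared job state. */
#include <stdio.h>
#include <stddef.h>

#define SLURM_SUCCESS       0
#define ESLURM_ALREADY_DONE 2019	/* placeholder value for this sketch */

struct job_record_stub {
	unsigned job_id;
	char *nodes_completing;	/* NULL once requeue has cleared job state */
};

static int handle_node_fail(struct job_record_stub *job_ptr)
{
	if (job_ptr->nodes_completing == NULL) {
		/* Already requeued: rely on the real-time server to put
		 * the cnodes in error state rather than downing them here. */
		printf("job %u already requeued, can not down cnodes\n",
		       job_ptr->job_id);
		return ESLURM_ALREADY_DONE;
	}
	/* Safe path: down the cnodes so another job is not allocated them. */
	printf("downing cnodes %s for job %u\n",
	       job_ptr->nodes_completing, job_ptr->job_id);
	return SLURM_SUCCESS;
}

int main(void)
{
	struct job_record_stub requeued = { 42, NULL };
	struct job_record_stub failing  = { 43, "bgq0000" };

	handle_node_fail(&requeued);	/* returns ESLURM_ALREADY_DONE */
	handle_node_fail(&failing);	/* downs the listed cnodes */
	return 0;
}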