diff --git a/NEWS b/NEWS index 83b93b6fb7efd4395b76f6677b075d5615f677ad..01aa90c714680a95a18dd0dbf5f0997646b3c9da 100644 --- a/NEWS +++ b/NEWS @@ -195,6 +195,8 @@ documents those changes that are of interest to users and admins. -- Try to load libslurm.so only when necessary. -- When nodes scheduled for reboot, set state to DOWN rather than FUTURE so they are still visible to sinfo. State set to IDLE after reboot completes. + -- Apply BatchStartTimeout configuration to task launch and avoid aborting + srun commands due to long running Prolog scripts. * Changes in Slurm 14.03.6 ========================== diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index dece5a4c6fbe3fcdd87b32f5e5920d91f00c2c6d..d3439e8aab7ededb33773c19d2756420053301fc 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -324,6 +324,15 @@ allocation. The default value is 10 (seconds). Larger values may be required if more time is required to execute the \fBProlog\fR, load user environment variables (for Moab spawned jobs), or if the slurmd daemon gets paged from memory. +.br +.br +\fBNote\fR: The test for a job being successfully launched is only performed when +the Slurm daemon on the compute node registers state with the slurmctld daemon +on the head node, which happens fairly rarely. +Therefore a job will not necessarily be terminated if its start time exceeds +\fBBatchStartTimeout\fR. +This configuration parameter is also applied to task launch to avoid aborting +\fBsrun\fR commands due to long running \fBProlog\fR scripts. 
.TP \fBCacheGroups\fR diff --git a/src/api/step_launch.c b/src/api/step_launch.c index 7e9a373763b63c1d3074b0824f9bd8d359d30625..433a943aad9177b55e038c9afaedc241c79de641 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -1596,6 +1596,13 @@ static int _launch_tasks(slurm_step_ctx_t *ctx, hostlist_destroy(hl); } + /* Extend timeout based upon BatchStartTimeout to allow for a long + * running Prolog */ + if (timeout <= 0) { + timeout = (slurm_get_msg_timeout() + + slurm_get_batch_start_timeout()) * 1000; + } + slurm_msg_t_init(&msg); msg.msg_type = REQUEST_LAUNCH_TASKS; msg.data = launch_msg;