diff --git a/NEWS b/NEWS index 3435733b1efc5e0829eb46c938e34a2a3b7883ef..7e323aaaed032d2e2051f3e719110878ddc6a566 100644 --- a/NEWS +++ b/NEWS @@ -164,6 +164,13 @@ documents those changes that are of interest to users and administrators. to a insufficient priviledges. -- Add warning about libcurl-devel not being installed during configure. -- Streamline job purge by handling file deletion on a separate thread. + -- Always set RLIMIT_CORE to the maximum permitted for slurmd, to ensure + core files are created even on non-developer builds. + -- Fix --ntasks-per-core option/environment variable parsing to set + the requested value, instead of always setting one. + -- If trying to cancel a step that hasn't started yet for some reason, return + a good return code. + -- Fix issue with sacctmgr show where user='' * Changes in Slurm 17.02.3 ========================== diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index 4ebf4ccbbbbdea421016ee3069e5b04216a8cdb3..c3e1fe8d67070460ac8e5f2db942baeeae672e42 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -62,6 +62,7 @@ <li><a href="#sbatch_srun">What is the difference between the sbatch and srun commands?</a></li> <li><a href="#squeue_color">Can squeue output be color coded?</a></li> +<li><a href="#x11">Can Slurm export an X11 display on an allocated compute node?</a></li> </ol> <h2>For Administrators</h2> @@ -1056,6 +1057,46 @@ regex white:default ^JOBID.* </pre> <img src="squeue_color.png" width=600> +<p><a name="x11"><b>33. Can Slurm export an X11 display on an allocated compute node?</b></a><br> +You will need to build and install an optional SPANK plugin for that functionality +today. The functionality should be provided natively in Slurm soon. Instructions +to build and install the plugin follow. 
Update the Slurm installation path as needed.</p> +<pre> +# Maybe obvious, but don't forget the -X on ssh +$ ssh -X alex@testserver.com + +# Get the plugin +$ mkdir git +$ cd git +$ git clone https://github.com/hautreux/slurm-spank-x11.git +$ cd slurm-spank-x11 + +# Manually edit the X11_LIBEXEC_PROG macro definition +$ vi slurm-spank-x11.c +$ vi slurm-spank-x11-plug.c +$ grep "define X11_" slurm-spank-x11.c +#define X11_LIBEXEC_PROG "/opt/slurm/17.02/libexec/slurm-spank-x11" +$ grep "define X11_LIBEXEC_PROG" slurm-spank-x11-plug.c +#define X11_LIBEXEC_PROG "/opt/slurm/17.02/libexec/slurm-spank-x11" + + +# Compile +$ gcc -g -o slurm-spank-x11 slurm-spank-x11.c +$ gcc -g -I/opt/slurm/17.02/include -shared -fPIC -o x11.so slurm-spank-x11-plug.c + +# Install +$ mkdir -p /opt/slurm/17.02/libexec +$ install -m 755 slurm-spank-x11 /opt/slurm/17.02/libexec +$ install -m 755 x11.so /opt/slurm/17.02/lib/slurm + +# Configure +$ echo -e "optional\tx11.so" >> /opt/slurm/17.02/etc/plugstack.conf +$ cd ~/tests + +# Run +$ srun -n1 --pty --x11 xclock +alex@node1's password: +</pre> <p class="footer"><a href="#top">top</a></p> @@ -2194,6 +2235,6 @@ slurmctld for those changes to be recognized. 
<p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 12 May 2017</p> +<p style="text-align:center;">Last modified 6 June 2017</p> <!--#include virtual="footer.txt"--> diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 157a79feab37470f1926b9bbec1d134f75054233..fb815a3c38e011dc7f6073e104699dc4611c95dd 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -238,7 +238,7 @@ extern int slurm_char_list_copy(List dst, List src) /* returns number of objects added to list */ extern int slurm_addto_char_list(List char_list, char *names) { - int i = 0, start = 0; + int i = 0, start = 0, cnt = 0; char *name = NULL; ListIterator itr = NULL; char quote_c = '\0'; @@ -263,6 +263,7 @@ extern int slurm_addto_char_list(List char_list, char *names) i++; } start = i; + cnt = list_count(char_list); while (names[i]) { //info("got %d - %d = %d", i, start, i-start); if (quote && (names[i] == quote_c)) @@ -366,7 +367,8 @@ extern int slurm_addto_char_list(List char_list, char *names) i++; } - if (i-start) { + /* check for empty strings user='' etc */ + if ((cnt == list_count(char_list)) || (i - start)) { name = xstrndup(names+start, (i-start)); /* If we get a duplicate remove the * first one and tack this on the end. 
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 9e653b9718ef23f2a5f3ac331f5cc91d624f4764..b88fc90e67101cd7b141c0a92653f6f1345b987d 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -1109,7 +1109,8 @@ static int _attempt_backfill(void) if (assoc_limit_stop) { assoc_mgr_lock(&qos_read_lock); - list_for_each(part_list, _clear_qos_blocked_times, NULL); + list_for_each(assoc_mgr_qos_list, + _clear_qos_blocked_times, NULL); assoc_mgr_unlock(&qos_read_lock); } diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 61d3f81c66459f2c1a4c60af96f6f6057c1f60ec..ad7856e72ce7c53a65feef24a4e3d5f2d8523d12 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -1856,7 +1856,7 @@ static void _set_options(int argc, char **argv) optarg, true); setenvf(NULL, "SLURM_NTASKS_PER_CORE", "%d", opt.ntasks_per_core); - opt.ntasks_per_core = true; + opt.ntasks_per_core_set = true; break; case LONG_OPT_HINT: if (!optarg) @@ -1871,7 +1871,7 @@ static void _set_options(int argc, char **argv) exit(error_exit); } opt.hint_set = true; - opt.ntasks_per_core = true; + opt.ntasks_per_core_set = true; opt.threads_per_core_set = true; break; case LONG_OPT_BLRTS_IMAGE: diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index c918852138d275460c9ce840b044f7162146fdb1..2026b6db4c9cfde1a125867707040144c155f8eb 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -2090,7 +2090,8 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t *msg, /* do RPC call */ /* First set node DOWN if fatal error */ - if ((comp_msg->slurm_rc == ESLURM_ALREADY_DONE) || + if ((comp_msg->slurm_rc == ESLURMD_JOB_NOTRUNNING) || + (comp_msg->slurm_rc == ESLURM_ALREADY_DONE) || (comp_msg->slurm_rc == ESLURMD_CREDENTIAL_REVOKED)) { /* race condition on job termination, not a real error */ info("slurmd error running JobId=%u from %s=%s: %s", diff --git a/src/slurmd/slurmd/slurmd.c 
b/src/slurmd/slurmd/slurmd.c index 95ff786f49718d1919729793990c3c8709ead69d..eda00ad142863b337ffa3713d672069e596f1de0 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -1559,12 +1559,10 @@ _slurmd_init(void) rlim.rlim_cur = rlim.rlim_max; setrlimit(RLIMIT_NOFILE, &rlim); } -#ifndef NDEBUG if (getrlimit(RLIMIT_CORE, &rlim) == 0) { rlim.rlim_cur = rlim.rlim_max; setrlimit(RLIMIT_CORE, &rlim); } -#endif /* !NDEBUG */ /* * Create a context for verifying slurm job credentials diff --git a/src/slurmd/slurmstepd/step_terminate_monitor.c b/src/slurmd/slurmstepd/step_terminate_monitor.c index b043421a0ee9f5f38519b1089c3dc0350ff18bd8..464af54e18dfc976562e4ddc7e8b55aa1a8dbc7f 100644 --- a/src/slurmd/slurmstepd/step_terminate_monitor.c +++ b/src/slurmd/slurmstepd/step_terminate_monitor.c @@ -130,6 +130,8 @@ static void *_monitor(void *arg) if (rc == ETIMEDOUT) { char entity[24], time_str[24]; time_t now = time(NULL); + int rc; + _call_external_program(job); if (job->stepid == SLURM_BATCH_SCRIPT) { @@ -144,16 +146,17 @@ static void *_monitor(void *arg) } slurm_make_time_str(&now, time_str, sizeof(time_str)); - if (job->state != SLURMSTEPD_STEP_RUNNING) { + if (job->state < SLURMSTEPD_STEP_RUNNING) { error("*** %s STEPD TERMINATED ON %s AT %s DUE TO JOB NOT RUNNING ***", entity, job->node_name, time_str); + rc = ESLURMD_JOB_NOTRUNNING; } else { error("*** %s STEPD TERMINATED ON %s AT %s DUE TO JOB NOT ENDING WITH SIGNALS ***", entity, job->node_name, time_str); + rc = ESLURMD_KILL_TASK_FAILED; } - stepd_cleanup(NULL, job, NULL, NULL, SLURM_ERROR, 0); - abort(); + exit(stepd_cleanup(NULL, job, NULL, NULL, rc, 0)); } else if (rc != 0) { error("Error waiting on condition in _monitor: %m"); }