From 9d7a76c94bdc2855c3f99704c066c1432b0b422c Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 30 Jul 2007 18:35:44 +0000 Subject: [PATCH] svn merge -r11873:11892 https://eris.llnl.gov/svn/slurm/branches/slurm-1.2 --- NEWS | 1 + doc/html/faq.shtml | 81 ++++++++++++++++++++++++++++++++++++++++- doc/html/team.shtml | 3 +- src/common/stepd_api.c | 5 ++- src/scontrol/info_job.c | 13 ++++--- src/srun/opt.c | 8 ++-- 6 files changed, 97 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index 8662766d220..da481bf465c 100644 --- a/NEWS +++ b/NEWS @@ -26,6 +26,7 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.2.13 ========================= -- Add slurm.conf parameter JobFileAppend. + -- Fix for segv in "scontrol listpids" on nodes not in SLURM config. * Changes in SLURM 1.2.12 ========================= diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index c24c321ea07..7d82e817186 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -22,6 +22,7 @@ job?</a></li> name for a batch job?</a></li> <li><a href="#parallel_make">Can the <i>make</i> command utilize the resources allocated to a SLURM job?</a></li> +<li><a href="#terminal">Can tasks be launched with a remote terminal?</a></li> </ol> <h2>For Administrators</h2> <ol> @@ -340,6 +341,84 @@ overhead of SLURM's task launch. Use with make's <i>-j</i> option within an existing SLURM allocation. Outside of a SLURM allocation, make's behavior will be unchanged.</p> +<p><a name="terminal"><b>16. Can tasks be launched with a remote +terminal?</b></a><br> +SLURM does not directly support a remote pseudo terminal for spawned +tasks. +We intend to remedy this in Slurm version 1.3. +Until then, you can accomplish this by starting an appropriate program +or script. In the simplest case (X11 over TCP with the DISPLAY +environment already set), <i>srun xterm</i> may suffice. In the more +general case, the following scripts should work. 
+<b>NOTE: The pathname to the second script is included in the variable +BS of the first script. You must change this in the first script.</b> +Execute the script with the sbatch options desired. +For example, <i>interactive -N2 -pdebug</i>. + +<pre> +#!/bin/bash +# -*- coding: utf-8 -*- +# Author: Pär Andersson (National Supercomputer Centre, Sweden) +# Version: 0.2 2007-07-30 +# +# This will submit a batch script that starts screen on a node. +# Then ssh is used to connect to the node and attach the screen. +# The result is very similar to an interactive shell in PBS +# (qsub -I) + +# Batch Script that starts SCREEN +BS=/PATH_TO_BATCH_SCRIPT/_interactive + +# Submit the job and get the job id +JOB=`sbatch -output=/dev/null -error=/dev/null $@ $BS 2>&1 \ + | egrep -o -e "\b[0-9]+$"` + +# Make sure the job is always canceled +trap "{ /usr/bin/scancel -q $JOB; exit; }" SIGINT SIGTERM EXIT + +echo "Waiting for JOBID $JOB to start" +while true;do + sleep 5s + + # Check job status + STATUS=`squeue -j $JOB -t PD,R -h -o %t` + + if [ "$STATUS" = "R" ];then + # Job is running, break the while loop + break + elif [ "$STATUS" != "PD" ];then + echo "Job is not Running or Pending. Aborting" + scancel $JOB + exit 1 + fi + + echo -n "." + +done + +# Determine the first node in the job: +NODE=`srun --jobid=$JOB -N1 hostname` + +# SSH to the node and attach the screen +sleep 1s +ssh -t $NODE screen -rd -S slurm$JOB +# The trap will now cancel the job before exiting. +</pre> + +<p>NOTE: The above script executes the script below, +named <i>_interactive</i>.</p> +<pre> +#!/bin/sh +# -*- coding: utf-8 -*- +# Author: Pär Andersson (National Supercomputer Centre, Sweden) +# Version: 0.2 2007-07-30 +# +# Simple batch script that starts SCREEN. + +exec screen -Dm -S slurm$SLURM_JOBID +</pre> + + <p class="footer"><a href="#top">top</a></p> <h2>For Administrators</h2> @@ -711,6 +790,6 @@ about these options. 
<p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 15 June 2007</p> +<p style="text-align:center;">Last modified 30 July 2007</p> <!--#include virtual="footer.txt"--> diff --git a/doc/html/team.shtml b/doc/html/team.shtml index d3946e6fa5d..ab90d33f1b9 100644 --- a/doc/html/team.shtml +++ b/doc/html/team.shtml @@ -18,6 +18,7 @@ <p> SLURM contributers include: </p> <ul> <li>Amjad Majid Ali (Colorado State University)</li> +<li>Pär Andersson (National Supercomputer Centre, Sweden)</li> <li>Don Albert (Bull)</li> <li>Ernest Artiaga (Barcelona Supercomputer Center, Spain)</li> <li>Anton Blanchard (Samba)</li> @@ -55,6 +56,6 @@ Networking, Italy)</li> <li>Anne-Marie Wunderlin (Bull)</li> </ul> -<p style="text-align:center;">Last modified 9 July 2007</p> +<p style="text-align:center;">Last modified 26 July 2007</p> <!--#include virtual="footer.txt"--> diff --git a/src/common/stepd_api.c b/src/common/stepd_api.c index 8f8fafae1a6..faebb0d4974 100644 --- a/src/common/stepd_api.c +++ b/src/common/stepd_api.c @@ -162,7 +162,7 @@ _guess_nodename() char host[256]; char *nodename = NULL; - if (gethostname_short(host, 256) != 0) + if (gethostname_short(host, 256) != 0) return NULL; nodename = slurm_conf_get_nodename(host); @@ -483,7 +483,8 @@ stepd_available(const char *directory, const char *nodename) struct stat stat_buf; if (nodename == NULL) { - nodename = _guess_nodename(); + if (!(nodename = _guess_nodename())) + return NULL; } if (directory == NULL) { slurm_ctl_conf_t *cf; diff --git a/src/scontrol/info_job.c b/src/scontrol/info_job.c index 7aa06b49156..fc99620c993 100644 --- a/src/scontrol/info_job.c +++ b/src/scontrol/info_job.c @@ -499,10 +499,10 @@ _list_pids_all_steps(const char *node_name, uint32_t jobid) int count = 0; steps = stepd_available(NULL, node_name); - if (list_count(steps) == 0) { - fprintf(stderr, "Job %u does not exist on this node.\n", - jobid); - list_destroy(steps); + if (!steps || list_count(steps) == 0) { + 
fprintf(stderr, "Job %u does not exist on this node.\n", jobid); + if (steps) + list_destroy(steps); exit_code = 1; return; } @@ -533,9 +533,10 @@ _list_pids_all_jobs(const char *node_name) step_loc_t *stepd; steps = stepd_available(NULL, node_name); - if (list_count(steps) == 0) { + if (!steps || list_count(steps) == 0) { fprintf(stderr, "No job steps exist on this node.\n"); - list_destroy(steps); + if (steps) + list_destroy(steps); exit_code = 1; return; } diff --git a/src/srun/opt.c b/src/srun/opt.c index 8667fdcf32f..b5b9717d283 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -2693,11 +2693,11 @@ static void _help(void) "\n" "Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" " -B --extra-node-info=S[:C[:T]] Expands to:\n" -" --sockets-per-node=S number of sockets per node to allocate\n" -" --cores-per-socket=C number of cores per socket to allocate\n" -" --threads-per-core=T number of threads per core to allocate\n" +" --sockets-per-node=S number of sockets per node to allocate\n" +" --cores-per-socket=C number of cores per socket to allocate\n" +" --threads-per-core=T number of threads per core to allocate\n" " each field can be 'min[-max]' or wildcard '*'\n" -" total cpus requested = (N x S x C x T)\n" +" total cpus requested = (N x S x C x T)\n" "\n" " --ntasks-per-socket=n number of tasks to invoke on each socket\n" " --ntasks-per-core=n number of tasks to invoke on each core\n" -- GitLab