diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 4f0981ba6897c5c05ef9c607487c54ac9a3bd355..152c373dbc3c31095abd17f7a10626266e5ce8a7 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -48,6 +48,16 @@ run one process, nodes with 2 cpus will be allocated 2 processes, etc.
 The distribution of processes across nodes may be controlled using
 this option along with the \fB\-n\fR and \fB\-c\fR options.
 .TP
+\fB\-r\fR, \fB\-\-relative\fR=\fIn\fR
+Run a job step relative to node \fIn\fR of the current allocation.
+This option may be used to spread several job steps out among the
+nodes of the current job. If \fB\-r\fR is used, the current job
+step will begin at node \fIn\fR of the allocated nodelist, where
+the first node is considered node 0. The \fB\-r\fR option is not
+permitted along with \fB\-w\fR or \fB\-x\fR, and will be silently
+ignored when not running within a prior allocation (i.e. when
+SLURM_JOBID is not set). The default for \fIn\fR is 0.
+.TP
 \fB\-p\fR, \fB\-\-partition\fR=\fIpartition\fR
 Request resources from partition "\fIpartition\fR."
 Partitions are created by the slurm administrator.
@@ -489,6 +499,9 @@ SLURMD_DEBUG
 SLURM_DISTRIBUTION
 \fB\-m, \-\-distribution\fR=(\fIblock|cyclic\fR)
 .TP
+SLURM_LABELIO
+\fB\-l, \-\-label\fR
+.TP
 SLURM_NNODES
 \fB\-N, \-\-nodes\fR=(\fIn|min-max\fR)
 .TP
@@ -575,9 +588,22 @@ This simple example demonstrates the execution of the command
 \fBhostname\fR in eight tasks. At least eight processors will be
 allocated to the job (the same as the task count) on however many
 nodes are required to satisfy the request. The output of each task
 will be proceeded with its task number.
-.br
+(The machine "dev" in the example below has a total of two CPUs per node.)
+
+.nf
+> srun \-n8 \-l hostname
+0: dev0
+1: dev0
+2: dev1
+3: dev1
+4: dev2
+5: dev2
+6: dev3
+7: dev3
+.fi
+.PP
 This example demonstrates how one might submit a script for later
 execution (batch mode). The script will be initiated when resources
 are available and no higher priority job is pending for the same
@@ -585,40 +611,87 @@ partition. The script will execute on 4 nodes with one task per node
 implicit. Note that the script executes on one node. For the script
 to utilize all allocated nodes, it must execute the \fBsrun\fR command
 or an MPI program.
-.br
-> cat my_script
-.br
-#!/bin/csh
-.br
+
+.nf
+
+> cat test.sh
+#!/bin/sh
 date
-.br
 srun \-l hostname
-.br
-> srun \-N4 \-b my_script
+> srun \-N4 \-b test.sh
+srun: jobid 42 submitted
+
+.fi
+.PP
+The output of test.sh would be found in the default output file
+"slurm\-42.out".
+.PP
+The srun \fB\-r\fR option is used within a job script
+to run two job steps on disjoint nodes in the following
+example. The script is run using allocate mode instead
+of as a batch job in this case.
+
+.nf
+
+> cat test.sh
+#!/bin/sh
+echo $SLURM_NODELIST
+srun \-lN2 \-r2 hostname
+srun \-lN2 hostname
+
+> srun \-A \-N4 test.sh
+dev[7-10]
+0: dev9
+1: dev10
+0: dev7
+1: dev8
+
+.fi
+.PP
+The following script runs two job steps in parallel
+within an allocated set of nodes.
+
+.nf
+
+> cat test.sh
+#!/bin/bash
+srun \-lN2 \-n4 \-r 2 sleep 60 &
+srun \-lN2 \-r 0 sleep 60 &
+sleep 1
+squeue
+squeue \-s
+wait
+
+> srun \-A \-N4 test.sh
+  JOBID PARTITION     NAME     USER  ST      TIME  NODES NODELIST
+  65641     batch  test.sh   grondo   R      0:01      4 dev[7-10]
+
+STEPID     PARTITION     USER      TIME NODELIST
+65641.0        batch   grondo      0:01 dev[7-8]
+65641.1        batch   grondo      0:01 dev[9-10]
+
+.fi
+.PP
 This example demonstrates how one executes a simple MPICH job
 in the event that MPICH has not been configured to automatically set
 the required parameters (this is the worst cases scenario). We use
 \fBsrun\fR to build a list of machines (nodes) to be used by
 \fBmpirun\fR in its required format. A sample command line and
 the script to be executed follow.
-.br
-> cat my_script
-#!/bin/csh
-.br
-srun /bin/hostname >nodes
-.br
-mpirun \-np $SLURM_NPROCS \-machinefile nodes /bin/hostname
-.br
-rm node_list
-.br
-> srun \-N2 \-n4 my_script
-If MPICH is configured to directly use SLURM, the execute line is
-the much simpler:
-.br
-> mpirun \-np 4 /bin/hostname
+.nf
+
+> cat test.sh
+#!/bin/sh
+PROCFILE="nodes.$SLURM_JOBID"
+srun \-o $PROCFILE /bin/hostname
+mpirun \-np $SLURM_NPROCS \-machinefile $PROCFILE /bin/hostname
+rm $PROCFILE
+
+> srun \-AN2 \-n4 test.sh
+
+.fi
 .SH "BUGS"
 If the number of processors per node allocated to a job is not evenly
diff --git a/src/srun/job.c b/src/srun/job.c
index 68cfd17be1dd0a5a6f42ebf567e80a4381ee88d4..e802b599e5cb25040da93da1a5a920a30f49987c 100644
--- a/src/srun/job.c
+++ b/src/srun/job.c
@@ -47,6 +47,7 @@
 #include "src/srun/job.h"
 #include "src/srun/opt.h"
 #include "src/srun/fname.h"
+#include "src/srun/signals.h"
 
 #if HAVE_TOTALVIEW
 #include "src/srun/attach.h"
@@ -100,21 +101,21 @@ job_t *
 job_create_allocation(resource_allocation_response_msg_t *resp)
 {
         job_t *job;
-        allocation_info_t *info = xmalloc(sizeof(*info));
+        allocation_info_t *i = xmalloc(sizeof(*i));
 
-        info->nodelist = _normalize_hostlist(resp->node_list);
-        info->nnodes = resp->node_cnt;
-        info->jobid = resp->job_id;
-        info->stepid = NO_VAL;
-        info->num_cpu_groups = resp->num_cpu_groups;
-        info->cpus_per_node  = resp->cpus_per_node;
-        info->cpu_count_reps = resp->cpu_count_reps;
-        info->addrs = resp->node_addr;
+        i->nodelist = _normalize_hostlist(resp->node_list);
+        i->nnodes = resp->node_cnt;
+        i->jobid = resp->job_id;
+        i->stepid = NO_VAL;
+        i->num_cpu_groups = resp->num_cpu_groups;
+        i->cpus_per_node  = resp->cpus_per_node;
+        i->cpu_count_reps = resp->cpu_count_reps;
+        i->addrs = resp->node_addr;
 
-        job = _job_create_internal(info);
+        job = _job_create_internal(i);
 
-        xfree(info->nodelist);
-        xfree(info);
+        xfree(i->nodelist);
+        xfree(i);
 
         return (job);
 }
@@ -253,7 +254,8 @@ job_destroy(job_t *job, int error)
                 debug("cancelling job %u", job->jobid);
                 slurm_complete_job(job->jobid, 0, error);
         } else {
-                debug("no allocation to cancel");
+                debug("no allocation to cancel, killing remote tasks");
+                fwd_signal(job, SIGKILL);
                 return;
         }
@@ -545,6 +547,148 @@ _host_state_name(host_state_t state_inx)
         }
 }
 
+
+/*
+ * Shift the first host off hostlist hl and return it as an integer.
+ * Returns -2 when the hostlist is empty, -1 if the name is not a
+ * non-negative integer.
+ */
+static int
+_hostlist_shift_int(hostlist_t hl)
+{
+        char *str = hostlist_shift(hl);
+        char *p = NULL;
+        long n;
+
+        if (!str) return (-2);
+
+        n = strtol(str, &p, 10);
+        if ((n < 0) || (*p != '\0')) {
+                free(str);
+                return -1;
+        }
+
+        free(str);
+
+        return ((int) n);
+}
+
+/*
+ * Returns a ranged string representation of hostlist hl.
+ * The string is allocated with xmalloc() and must be freed with xfree().
+ */
+static char *
+_hostlist_string_create(hostlist_t hl)
+{
+        int len = 4096;
+        char *buf = xmalloc(len*sizeof(char));
+
+        while (hostlist_ranged_string(hl, len, buf) < 0)
+                xrealloc(buf, (len+=4096)*sizeof(char));
+
+        return buf;
+}
+
+/*
+ * Applies the setting of opt.relative to the given hostlist.
+ */
+static char *
+_relative_hosts(hostlist_t hl)
+{
+        int n = 0;
+        hostlist_t rl, rlist;
+        char *relnodes = NULL;
+
+        xassert(opt.relative);
+
+        if (!(rl = hostlist_create(opt.relative)))
+                return NULL;
+
+        rlist = hostlist_create(NULL);
+
+        if (hostlist_count(rl) == 1) {
+                int i;
+                int origin  = _hostlist_shift_int(rl);
+                int horizon = MIN(opt.min_nodes, hostlist_count(hl));
+
+                for (i = 0; i < horizon; i++) {
+                        char *host = hostlist_nth(hl, i + origin);
+                        hostlist_push_host(rlist, host);
+                        free(host);
+                }
+
+                goto done;
+        }
+
+        while ((n = _hostlist_shift_int(rl)) > -2) {
+                char *host;
+
+                if (n < 0) {
+                        hostlist_destroy(rlist);
+                        hostlist_destroy(rl);
+                        return NULL;
+                }
+
+                host = hostlist_nth(hl, n);
+                hostlist_push_host(rlist, host);
+                free(host);
+        }
+
+    done:
+        relnodes = _hostlist_string_create(rlist);
+
+        /*
+         * Reset min nodes to the minimum of the new count of available
+         * hosts and the existing value. This means that requesting
+         * relative nodes is, in effect, deselecting nodes outside
+         * the relative set.
+         *
+         * This will allow proper srun options to fall naturally
+         * out of use of the relative option.
+         */
+        opt.min_nodes = MIN(opt.min_nodes, hostlist_count(rlist));
+
+        hostlist_destroy(rlist);
+        hostlist_destroy(rl);
+        return relnodes;
+}
+
+/*
+ * Apply the user option -r, --relative to the allocation response.
+ * Exits the program on an error parsing the relative option.
+ */
+static void
+_apply_relative_option(resource_allocation_response_msg_t *resp,
+                       bitstr_t *reqbits)
+{
+        bitstr_t *relbits = NULL;
+        char *relnodes = NULL;
+        hostlist_t hl = NULL;
+
+        if (!opt.relative)
+                return;
+
+        hl = hostlist_create(resp->node_list);
+
+        if (!(relnodes = _relative_hosts(hl))) {
+                error("Bad argument to -r,--relative: `%s'", opt.relative);
+                exit(1);
+        }
+
+        relbits = bit_alloc(resp->node_cnt);
+
+        _job_resp_bitmap(hl, relnodes, reqbits);
+        _job_resp_hack(resp, reqbits);
+
+        hostlist_destroy(hl);
+        xfree(relnodes);
+        bit_free(relbits);
+
+        return;
+}
+
+
 /* The below functions are used to support job steps *\
 \* with different allocations than the parent job.   */
 int job_resp_hack_for_step(resource_allocation_response_msg_t *resp)
@@ -554,6 +698,13 @@ int job_resp_hack_for_step(resource_allocation_response_msg_t *resp)
         int return_code = 0, total;
 
         req_bitmap = bit_alloc(resp->node_cnt);
+        exc_bitmap = bit_alloc(resp->node_cnt);
+
+        /*
+         * Apply -r, --relative option first
+         */
+        _apply_relative_option(resp, req_bitmap);
+
         if (opt.nodelist &&
             _job_resp_bitmap(resp_nodes, opt.nodelist, req_bitmap)) {
                 error("Required nodes (%s) missing from job's allocation (%s)",
@@ -562,7 +713,6 @@ int job_resp_hack_for_step(resource_allocation_response_msg_t *resp)
                 goto cleanup;
         }
 
-        exc_bitmap = bit_alloc(resp->node_cnt);
         if (opt.exc_nodes) {
                 bitstr_t *tmp_bitmap;
                 int overlap;
@@ -572,8 +722,9 @@ int job_resp_hack_for_step(resource_allocation_response_msg_t *resp)
                 overlap = bit_set_count(tmp_bitmap);
                 bit_free(tmp_bitmap);
                 if (overlap > 0) {
-                        error("Duplicates in hostlist (%s) and exclude list (%s)",
-                              opt.nodelist, opt.exc_nodes);
+                        error("Duplicates in hostlist (%s) "
+                              "and exclude list (%s)",
+                              opt.nodelist, opt.exc_nodes);
                         return_code = 1;
                         goto cleanup;
                 }
@@ -587,7 +738,7 @@ int job_resp_hack_for_step(resource_allocation_response_msg_t *resp)
                       opt.min_nodes, total);
                 return_code = 1;
                 goto cleanup;
-        }
+        }
         }
 
         if (total != resp->node_cnt)
@@ -736,8 +887,9 @@ _job_resp_hack(resource_allocation_response_msg_t *resp, bitstr_t *req_bitmap)
                         memcpy(new_node_addr+new_inx,
                                resp->node_addr+old_inx, sizeof(slurm_addr));
-                        new_cpus_per_node[new_inx] = _job_resp_cpus(
-                                resp->cpus_per_node, resp->cpu_count_reps, old_inx);
+                        new_cpus_per_node[new_inx] =
+                                _job_resp_cpus(resp->cpus_per_node,
+                                               resp->cpu_count_reps, old_inx);
                         new_cpu_count_reps[new_inx] = 1;
                         new_inx++;
                 }
diff --git a/src/srun/opt.c b/src/srun/opt.c
index bfac3a79cdb9edf82ee07609fed2f2befd60462b..0e4d8ed3d77e9ba8b9352bdeb5066303446cadc9 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -105,6 +105,7 @@
 #define OPT_WAIT        0x19
 #define OPT_OVERCOMMIT  0x1a
 #define OPT_HOLD        0x1b
+#define OPT_RELATIVE    0x1c
 
 /* constraint type options */
 #define OPT_MINCPUS     0x50
@@ -181,8 +182,11 @@ struct poptOption runTable[] = {
          "number of cpus required per task",
          "ncpus"},
         {"nodes", 'N', POPT_ARG_STRING, 0, OPT_NODES,
-         "number of nodes on which to run (nnodes = count|min-max)",
+         "number of nodes on which to run (nnodes = min[-max])",
          "nnodes"},
+        {"relative", 'r', POPT_ARG_STRING, &opt.relative, OPT_RELATIVE,
+         "run job step relative to node n of allocation",
+         "n"},
         {"partition", 'p', POPT_ARG_STRING, &opt.partition, OPT_PARTITION,
          "partition requested",
          "partition"},
@@ -626,12 +630,12 @@ struct env_vars {
 };
 
 env_vars_t env_vars[] = {
-  {"SLURMD_DEBUG",        OPT_INT,       &opt.slurmd_debug,  NULL           },
   {"SLURM_NPROCS",        OPT_INT,       &opt.nprocs,        &opt.nprocs_set},
   {"SLURM_CPUS_PER_TASK", OPT_INT,       &opt.cpus_per_task, &opt.cpus_set  },
   {"SLURM_PARTITION",     OPT_STRING,    &opt.partition,     NULL           },
   {"SLURM_IMMEDIATE",     OPT_INT,       &opt.immediate,     NULL           },
   {"SLURM_DEBUG",         OPT_DEBUG,     NULL,               NULL           },
+  {"SLURMD_DEBUG",        OPT_INT,       &opt.slurmd_debug,  NULL           },
   {"SLURM_NNODES",        OPT_NODES,     NULL,               NULL           },
   {"SLURM_OVERCOMMIT",    OPT_OVERCOMMIT,NULL,               NULL           },
   {"SLURM_DISTRIBUTION",  OPT_DISTRIB,   NULL,               NULL           },
@@ -948,7 +952,6 @@ _opt_verify(poptContext optctx)
 {
         bool verified = true;
 
-
         if (opt.slurmd_debug + LOG_LEVEL_ERROR > LOG_LEVEL_DEBUG2)
                 opt.slurmd_debug = LOG_LEVEL_DEBUG2 - LOG_LEVEL_ERROR;
 
@@ -962,6 +965,17 @@ _opt_verify(poptContext optctx)
                 verified = false;
         }
 
+        if (opt.no_alloc && opt.relative) {
+                error("do not specify -r,--relative with -Z,--no-allocate.");
+                verified = false;
+        }
+
+        if (opt.relative && (opt.exc_nodes || opt.nodelist)) {
+                error("-r,--relative not allowed with "
+                      "-w,--nodelist or -x,--exclude.");
+                verified = false;
+        }
+
         if (opt.mincpus < opt.cpus_per_task)
                 opt.mincpus = opt.cpus_per_task;
 
@@ -1017,15 +1031,18 @@ _opt_verify(poptContext optctx)
         } else if (opt.nodes_set && opt.nprocs_set) {
 
-                /* make sure # of procs >= min_nodes */
+                /*
+                 * make sure # of procs >= min_nodes
+                 */
                 if (opt.nprocs < opt.min_nodes) {
-                        error("Warning: can't run %d processes on %d "
+
+                        info ("Warning: can't run %d processes on %d "
                               "nodes, setting nnodes to %d",
-                              opt.nprocs, opt.min_nodes,
-                              opt.min_nodes);
+                              opt.nprocs, opt.min_nodes, opt.nprocs);
+
                         opt.min_nodes = opt.nprocs;
-                        if (opt.max_nodes &&
-                            (opt.min_nodes > opt.max_nodes))
+                        if ( opt.max_nodes
+                          && (opt.min_nodes > opt.max_nodes) )
                                 opt.max_nodes = opt.min_nodes;
                 }
 
diff --git a/src/srun/opt.h b/src/srun/opt.h
index b17aac9da3ad48e1847e795b71a38c70e8ba8b0b..84c54567f0ebcab29dc4dc296770ea9c8198740d 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -147,6 +147,7 @@ typedef struct srun_options {
         bool contiguous;        /* --contiguous                 */
         char *nodelist;         /* --nodelist=node1,node2,...   */
         char *exc_nodes;        /* --exclude=node1,node2,... -x */
+        char *relative;         /* --relative -r N              */
         bool no_alloc;          /* --no-allocate, -Z            */
         int max_launch_time;    /* Undocumented                 */
         int max_exit_timeout;   /* Undocumented                 */
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 431c8bc96b7fbc3b515b3266975fc5088982c89e..2b6fefe1884e81a67af4b8401ea230b8b31254c9 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -215,13 +215,12 @@ int main(int ac, char **av)
         /* job is now overdone, clean up
          *
-         * If job "failed" send SIGKILL to any remaining tasks.
          * If job is "forcefully terminated" exit immediately.
          *
          */
 
         if (job->state == SRUN_JOB_FAILED) {
                 info("Terminating job");
-                fwd_signal(job, SIGKILL);
+                job_destroy(job, 0);
         } else if (job->state == SRUN_JOB_FORCETERM) {
                 job_destroy(job, 0);
                 exit(1);
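
Reviewer note: the selection rule that _relative_hosts() implements above has two
branches: a single origin ("-r 2" takes min_nodes hosts starting at index 2 of the
allocation) and an explicit index list ("-r 0,2" takes exactly those indices). The
following standalone sketch illustrates that rule under stated assumptions; it is
NOT SLURM code. Plain string arrays stand in for the hostlist_t API, only comma
lists are parsed (the real option also accepts hostlist ranges), and the names
pick_relative(), alloc, and out are invented for illustration.

/*
 * relative_demo.c - sketch of the -r,--relative node-selection rule.
 * Build: cc -std=c99 -o relative_demo relative_demo.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Select hosts from alloc[0..nalloc-1] according to spec:
 *   "N"     - up to min_nodes hosts starting at index N (single-origin branch)
 *   "a,b,c" - exactly the listed indices (explicit-list branch)
 * Writes the selected hosts to out[] and returns their count, or -1 on a
 * bad spec.
 */
static int
pick_relative(const char **alloc, int nalloc, int min_nodes,
              const char *spec, const char **out)
{
        char *copy = strdup(spec);
        char *end;
        int nout = 0;

        if (!strchr(copy, ',')) {               /* single origin "N" */
                long origin = strtol(copy, &end, 10);
                int horizon = (min_nodes < nalloc) ? min_nodes : nalloc;

                if ((*end != '\0') || (origin < 0)) {
                        free(copy);
                        return -1;
                }
                for (int i = 0; (i < horizon) && (origin + i < nalloc); i++)
                        out[nout++] = alloc[origin + i];
        } else {                                /* explicit list "a,b,c" */
                for (char *tok = strtok(copy, ","); tok;
                     tok = strtok(NULL, ",")) {
                        long n = strtol(tok, &end, 10);

                        if ((*end != '\0') || (n < 0) || (n >= nalloc)) {
                                free(copy);
                                return -1;
                        }
                        out[nout++] = alloc[n];
                }
        }
        free(copy);
        return nout;
}

int main(void)
{
        /* mirrors the dev[7-10] allocation from the man page example */
        const char *alloc[] = { "dev7", "dev8", "dev9", "dev10" };
        const char *out[4];

        /* equivalent of "srun -lN2 -r2 hostname": 2 nodes from index 2 */
        int n = pick_relative(alloc, 4, 2, "2", out);
        for (int i = 0; i < n; i++)
                printf("%d: %s\n", i, out[i]);  /* prints 0: dev9, 1: dev10 */
        return 0;
}

As the comment in _relative_hosts() notes, the real implementation additionally
clamps opt.min_nodes to the number of selected hosts, so a relative request in
effect deselects the rest of the allocation.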