diff --git a/NEWS b/NEWS index 85de0fa08dd7fd4094f3e8c7e1b89647ac1dba83..5f5053a30e47885144d04a0faf610b682b6ee4ef 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,10 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.3.0-pre1 ============================= + -- !!! SRUN CHANGES !!! + The srun options -A/--allocate, -b/--batch, and -a/--attach have been + removed! That functionality is now available in the separate commands + salloc, sbatch, and sattach, respectively. -- Add new node state FAILING plus trigger for when node enters that state. -- Add new configuration paramter "PrivateData". This can be used to prevent a user from seeing jobs or job steps belonging to other users. diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 4c295defe50d60ee5aab6a414b903a6a78373213..e625de75ce7618de9c7e7784f891ec7e01295f03 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -7,17 +7,10 @@ srun \- run parallel jobs .SH SYNOPSIS \fBsrun\fR [\fIOPTIONS\fR...] \fIexecutable \fR[\fIargs\fR...] -.br -\fBsrun\fR \-\-batch [\fIOPTIONS\fR...] job_script \fR[\fIargs\fR...] -.br -\fBsrun\fR \-\-allocate [\fIOPTIONS\fR...] [job_script \fR[\fIargs\fR...]] -.br -.B srun -\-\-attach=jobid .SH DESCRIPTION -Allocate resources and optionally initiate parallel jobs on -clusters managed by SLURM. +Run a parallel job on a cluster managed by SLURM. If necessary, srun will +first create a resource allocation in which to run the parallel job. .SH "OPTIONS" .LP @@ -50,43 +43,6 @@ configured. If select/cons_res is configured, it must have a parameter of CR_Core, CR_Core_Memory, CR_Socket, or CR_Socket_Memory. -.TP -\fB\-b\fR, \fB\-\-batch\fR -Submit in "batch mode." \fBsrun\fR will make a copy of the \fIexecutable\fR -file (a script) and submit the request for execution when resouces are -available. \fBsrun\fR will terminate after the request has been submitted. 
-The \fIexecutable\fR file will run on the first node allocated to the -job and must contain \fBsrun\fR commands to initiate parallel tasks. -stdin will be redirected from /dev/null, stdout and stderr will be -redirected to a file (default is \fIjobname\fR.out or \fIjobid\fR.out in -current working directory, see \fB\-o\fR for other IO options). -Note that if the slurm daemons are cold\-started, jobid values will be -reused. Plan accordingly to avoid over\-writing output and error files. -\fIexecutable\fR must be specified using either a fully qualified -pathname or its pathname will be relative to the current working directory. -The search path will not be used to locate the file. \fIexecutable\fR -will be interpreted by the users default shell unless the file begins -with "#!" followed by the fully qualified pathname of a valid shell. -Note that batch jobs will be re\-queued if a node fails while it is being -initiated. - -Srun commandline options can also be inserted into the script by prefacing -the option with #SLURM. Multiple options can be on one line or multiple lines. -For example: - -.nf - #SLURM \-N 2 \-n 2 - #SLURM \-\-mpi=lam -.fi - -This is run the script on 2 nodes, with 2 procs with mpi type lam. -All commandline options are able to be set inside the script with the -exception of the mode (which has already been set since to run a batch -script you are in batch mode). -.br -Options on the command line take precedence over options in the batch -script, which in turn take precedence over exiting environmement variables. - .TP \fB\-\-begin\fR=\fItime\fR Defer initiation of this job until the specified time. @@ -273,18 +229,6 @@ Dedicate whole nodes to the job rather than individual processors even if consumable resources are enabled (e.g. \fBSelectType=select/cons_res\fR). -.TP -\fB\-\-get\-user\-env\fR -For a batch script submission, this option will tell srun to retrieve the -login environment variables for the user specified in the \-\-uid option. 
-The environment variables are retrieved by running "su - <username> -c -/usr/bin/env" and parsing the output. Be aware that any environment -variables already set in srun's environment will take precedence over any -environment variables in the user's login environment. -NOTE: This option only works if the caller has an -effective uid of "root", and only takes effect in batch mode (\-b/\-\-batch). -This option was originally created for use by Moab. - .TP \fB\-\-gid\fR=\fIgroup\fR If \fBsrun\fR is run as root, and the \fB\-\-gid\fR option is used, @@ -361,9 +305,7 @@ not for the submission of individual job steps. The job will assume all responsibilities for fault\-tolerance. The active job step (MPI job) will almost certainly suffer a fatal error, but subsequent job steps may be run if this option is specified. The -default action is to terminate job upon node failure. Note that -\fB\-\-batch\fR jobs will be re\-queued if a node failure occurs in the -process of initiating it. +default action is to terminate job upon node failure. .TP \fB\-l\fR, \fB\-\-label\fR @@ -635,14 +577,6 @@ to 10000 (lowest priority). Only privileged users can specify a negative adjustment. NOTE: This option is presently ignored if \fISchedulerType=sched/maui\fR. -.TP -\fB\-\-no\-requeue\fR -Specifies that the batch job is not requeue. -Setting this option will prevent system administrators from being able -to restart the job (for example, after a scheduled downtime). -When a job is requeued, the batch script is initiated from its beginning. -This option is only applicable to batch job submission (see \fB\-\-batch\fR). - .TP \fB\-\-ntasks\-per\-node\fR=\fIntasks\fR Request that no more than \fIntasks\fR be invoked on each node. @@ -886,42 +820,6 @@ Request that a specific list of hosts not be included in the resources allocated to this job. The host list will be assumed to be a filename if it contains a "/"character. -.PP -Allocate options. 
NOTE: This functionality has been moved to a new command, -salloc. These options will be removed from srun at a later date. - -.TP -\fB\-A\fR, \fB\-\-allocate\fR -allocate resources and spawn a shell. When \fB\-\-allocate\fR is specified to -\fBsrun\fR, no remote tasks are started. Instead a subshell is started that -has access to the allocated resources. Multiple jobs can then be run on the -same cpus from within this subshell. See \fBAllocate Mode\fR below. - -.TP -\fB\-\-no\-shell\fR -immediately exit after allocating resources instead of spawning a -shell when used with the \fB\-A\fR, \fB\-\-allocate\fR option. - -.PP -Attach to running job. NOTE: This functionality has been moved to a new -command, sattach. These options will be removed from srun at a later date. - -.TP -\fB\-a\fR, \fB\-\-attach\fR=\fIid\fR -This option will attach \fBsrun\fR -to a running job with job id = \fIid\fR. Provided that the calling user -has access to that running job, stdout and stderr will be redirected to the -current session (assuming that the tasks' stdout and stderr are not connected -directly to files). stdin is not connected to the remote tasks, and signals -are not forwarded unless the \fB\-\-join\fR parameter is also specified. - -.TP -\fB\-j\fR, \fB\-\-join\fR -Used in conjunction with \fB\-\-attach\fR to specify that stdin should -also be connected to the remote tasks (assuming that the remote tasks' -stdin are not directly connected to files), and signals sent to \fBsrun\fR -will be forwarded to the remote tasks. - .PP The following options support Blue Gene systems, but may be applicable to other systems as well. @@ -973,9 +871,6 @@ Default from \fIblugene.conf\fR if not set. Force the allocated nodes to reboot before starting the job. 
.PP -Unless the \fB\-a\fR (\fB\-\-attach\fR) or \fB\-A\fR (\fB\-\-allocate\fR) -options are specified (see \fBAllocate mode\fR and \fBAttaching to jobs\fR -below), .B srun will submit the job request to the slurm job controller, then initiate all processes on the remote nodes. If the request cannot be met immediately, @@ -1056,12 +951,7 @@ to this same task. .B srun will redirect stdout and/or stderr to the named file from all tasks. stdin will be redirected from the named file and broadcast to all -tasks in the job. If the job is submitted in batch mode using the -.B \-b -or -.B \-\-batch -option, \fIfilename\fR refers to a path on each of the nodes on which -the job runs. Otherwise \fIfilename\fR refers to a path on the host +tasks in the job. \fIfilename\fR refers to a path on the host that runs \fBsrun\fR. Depending on the cluster's file system layout, this may result in the output appearing in different places depending on whether the job is run in batch mode. @@ -1114,41 +1004,7 @@ job128\-00.out, job128\-01.out, ... .PP .RS -10 .PP -.B "Allocate Mode" -.PP -When the allocate option is specified (\fB\-A\fR, \fB\-\-allocate\fR) -\fBsrun\fR will not initiate any remote processes after acquiring -resources. Instead, \fBsrun\fR will spawn a subshell which has access -to the acquired resources. Subsequent instances of \fBsrun\fR from within -this subshell will then run on these resources. -.PP -If the name of a script is specified on the -commandline with \fB\-\-allocate\fR, the spawned shell will run the -specified script. Resources allocated in this way will only be freed -when the subshell terminates. -.PP -.B "Attaching to a running job" -.PP -Use of the \fB\-a\fR \fIjobid\fR (or \fB\-\-attach\fR) option allows -\fBsrun\fR to reattach to a running job, receiving stdout and stderr -from the job and forwarding signals to the job, just as if the current -session of \fBsrun\fR had started the job. (stdin, however, cannot -be forwarded to the job). 
-.PP -There are two ways to reattach to a running job. The default method -is to attach to the current job read\-only. In this case, -stdout and stderr are duplicated to the attaching \fBsrun\fR, but -signals are not forwarded to the remote processes (A single -Ctrl\-C will detach this read\-only \fBsrun\fR from the job). If -the \fB\-j\fR (\fB\-\-join\fR) option is is also specified, -\fBsrun\fR "joins" the running job, and is able to forward signals, -connects stdin, and acts for the most part much like the \fBsrun\fR -process that initiated the job. -.PP -Node and CPU selection options do not make sense when specifying -\fB\-\-attach\fR, and it is an error to use \fB\-n\fR, \fB\-c\fR, -or \fB\-N\fR in attach mode. -.PP + .SH "ENVIRONMENT VARIABLES" .PP Some srun options may be set via environment variables. @@ -1225,9 +1081,6 @@ Same as \fB\-\-ntasks\-per\-node\fRa \fBSLURN_NTASKS_PER_SOCKET\fR Same as \fB\-\-ntasks\-per\-socket\fRa .TP -\fBSLURM_NO_REQUEUE\fR -Same as \fB\-\-no\-requeue\fR -.TP \fBSLURM_NO_ROTATE\fR Same as \fB\-\-no\-rotate\fR .TP @@ -1473,26 +1326,6 @@ the request. The output of each task will be proceeded with its task number. 6: dev3 7: dev3 -.fi -.PP -This example demonstrates how one might submit a script for later -execution (batch mode). The script will be initiated when resources -are available and no higher priority job is pending for the same -partition. The script will execute on 4 nodes with one task per node -implicit. Note that the script executes on one node. For the script -to utilize all allocated nodes, it must execute the \fBsrun\fR command -or an MPI program. 
- .nf -> cat test.sh -#!/bin/sh -date -srun \-l hostname - -> srun \-N4 \-b test.sh -srun: jobid 42 submitted - .fi .PP The output of test.sh would be found in the default output file @@ -1511,7 +1344,7 @@ echo $SLURM_NODELIST srun \-lN2 \-r2 hostname srun \-lN2 hostname -> srun \-A \-N4 test.sh +> salloc \-N4 test.sh dev[7\-10] 0: dev9 1: dev10 @@ -1534,7 +1367,7 @@ squeue squeue \-s wait -> srun \-A \-N4 test.sh +> salloc \-N4 test.sh JOBID PARTITION NAME USER ST TIME NODES NODELIST 65641 batch test.sh grondo R 0:01 4 dev[7\-10] @@ -1565,7 +1398,7 @@ mpirun \-np $SLURM_NPROCS \-machinefile $MACHINEFILE mpi\-app rm $MACHINEFILE -> srun \-AN2 \-n4 test.sh +> salloc \-N2 \-n4 test.sh .fi .PP diff --git a/src/slaunch/Makefile.am b/src/slaunch/Makefile.am index 309c40008390a2cb4910f3dc89fca9b0d1d00c06..397a12ee9bf456f5dc7a8edc44c85078e19c2b2b 100644 --- a/src/slaunch/Makefile.am +++ b/src/slaunch/Makefile.am @@ -9,8 +9,8 @@ bin_PROGRAMS = slaunch slaunch_SOURCES = \ slaunch.c slaunch.h \ opt.c opt.h \ - attach.h \ - attach.c \ + debugger.h \ + debugger.c \ fname.c \ fname.h \ sigstr.c \ diff --git a/src/slaunch/Makefile.in b/src/slaunch/Makefile.in index 95e561bcae764c1eb352e69298e91062f6ede4f9..f2311fd9d42dd4a7e44543029f3f33769df519e5 100644 --- a/src/slaunch/Makefile.in +++ b/src/slaunch/Makefile.in @@ -68,9 +68,10 @@ CONFIG_CLEAN_FILES = am__installdirs = "$(DESTDIR)$(bindir)" binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) PROGRAMS = $(bin_PROGRAMS) -am_slaunch_OBJECTS = slaunch.$(OBJEXT) opt.$(OBJEXT) attach.$(OBJEXT) \ - fname.$(OBJEXT) sigstr.$(OBJEXT) core-format.$(OBJEXT) \ - multi_prog.$(OBJEXT) slaunch.wrapper.$(OBJEXT) +am_slaunch_OBJECTS = slaunch.$(OBJEXT) opt.$(OBJEXT) \ + debugger.$(OBJEXT) fname.$(OBJEXT) sigstr.$(OBJEXT) \ + core-format.$(OBJEXT) multi_prog.$(OBJEXT) \ + slaunch.wrapper.$(OBJEXT) slaunch_OBJECTS = $(am_slaunch_OBJECTS) slaunch_DEPENDENCIES = $(convenience_libs) slaunch_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ @@ 
-253,8 +254,8 @@ INCLUDES = -I$(top_srcdir) slaunch_SOURCES = \ slaunch.c slaunch.h \ opt.c opt.h \ - attach.h \ - attach.c \ + debugger.h \ + debugger.c \ fname.c \ fname.h \ sigstr.c \ @@ -340,8 +341,8 @@ mostlyclean-compile: distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/attach.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/core-format.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/debugger.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fname.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/multi_prog.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/opt.Po@am__quote@ diff --git a/src/slaunch/attach.c b/src/slaunch/debugger.c similarity index 89% rename from src/slaunch/attach.c rename to src/slaunch/debugger.c index 009c9928a41c39fce94d20701fd61d2097525a2e..99089dbc2f3ffdfaf9e81282a68f4c5298299ead 100644 --- a/src/slaunch/attach.c +++ b/src/slaunch/debugger.c @@ -1,6 +1,6 @@ /*****************************************************************************\ - * attach.c - Definitions needed for parallel debugger - * $Id$ + * debugger.c - Definitions needed for parallel debugger + * $Id: debugger.c 11149 2007-03-14 20:53:19Z morrone $ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -31,10 +31,10 @@ #include "src/common/log.h" -#include "src/slaunch/attach.h" +#include "src/slaunch/debugger.h" /* - * Instantiate extern variables from attach.h + * Instantiate extern variables from debugger.h */ MPIR_PROCDESC *MPIR_proctable; int MPIR_proctable_size; diff --git a/src/slaunch/attach.h b/src/slaunch/debugger.h similarity index 93% rename from src/slaunch/attach.h rename to src/slaunch/debugger.h index c1b50d26d4ca29f20587eb8de384016a0789e755..9f8bb7b6a7978ee1e57b9ba10e02a0e7b9ffc406 100644 --- a/src/slaunch/attach.h +++ b/src/slaunch/debugger.h @@ -1,5 +1,5 @@ /****************************************************************************\ - * attach.h - definitions needed for TotalView interactions + * debugger.h - definitions needed for TotalView interactions ***************************************************************************** * This file was supplied by James Cownie <jcownie@etnus.com> and provides * information required to interface Slurm to the TotalView debugger from @@ -7,7 +7,7 @@ * http://www.etnus.com/ \*****************************************************************************/ -/* $Id$ +/* $Id: debugger.h 11149 2007-03-14 20:53:19Z morrone $ */ /* This file contains support for bringing processes up stopped, so that @@ -19,8 +19,8 @@ * Nov 27 1996 jcownie@dolphinics.com: Added the executable_name to MPIR_PROCDESC */ -#ifndef _SLAUNCH_ATTACH_INCLUDE -#define _SLAUNCH_ATTACH_INCLUDE +#ifndef _SLAUNCH_DEBUGGER_INCLUDE +#define _SLAUNCH_DEBUGGER_INCLUDE #ifndef VOLATILE #if defined(__STDC__) || defined(__cplusplus) diff --git a/src/slaunch/multi_prog.c b/src/slaunch/multi_prog.c index 46c3383ef9ae4fddf3da9b0385bc84de1ecd71c5..69515022c924e2ae85ec7726226a36c12a6a9438 100644 --- a/src/slaunch/multi_prog.c +++ b/src/slaunch/multi_prog.c @@ -47,7 +47,7 @@ #include "src/common/xassert.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" -#include "src/slaunch/attach.h" +#include "src/slaunch/debugger.h" /* Given a 
program name, translate it to a fully qualified pathname * as needed based upon the PATH environment variable */ diff --git a/src/slaunch/opt.c b/src/slaunch/opt.c index 83658f41f5660ed313b3448f930820d696da8376..8516970b9fab18d834706dd219813a017431e004 100644 --- a/src/slaunch/opt.c +++ b/src/slaunch/opt.c @@ -75,7 +75,7 @@ #include "src/common/mpi.h" #include "src/api/pmi_server.h" -#include "src/slaunch/attach.h" +#include "src/slaunch/debugger.h" /* generic OPT_ definitions -- mainly for use with env vars */ #define OPT_NONE 0x00 @@ -160,9 +160,9 @@ static char *_search_path(char *, int); static void _usage(void); static int _verify_cpu_bind(const char *arg, char **cpu_bind, - cpu_bind_type_t *cpu_bind_type); + cpu_bind_type_t *flags); static int _verify_mem_bind(const char *arg, char **mem_bind, - mem_bind_type_t *mem_bind_type); + mem_bind_type_t *flags); static task_dist_states_t _verify_dist_type(const char *arg, uint32_t *psize); /*---[ end forward declarations of static functions ]---------------------*/ @@ -178,6 +178,9 @@ int initialize_and_process_args(int argc, char *argv[]) /* initialize options with argv */ _opt_args(argc, argv); + if (!_opt_verify()) + exit(1); + if (opt.verbose > 1) _opt_list(); @@ -209,27 +212,64 @@ static int _isvalue(char *arg) { return 0; /* not a value */ } +/* + * First clear all of the bits in "*data" which are set in "clear_mask". + * Then set all of the bits in "*data" that are set in "set_mask". 
+ */ +static void clear_then_set(int *data, int clear_mask, int set_mask) +{ + *data &= ~clear_mask; + *data |= set_mask; +} + +static void _print_cpu_bind_help() +{ + printf( +"CPU bind options:\n" +" --cpu_bind= Bind tasks to CPUs\n" +" q[uiet] quietly bind before task runs (default)\n" +" v[erbose] verbosely report binding before task runs\n" +" no[ne] don't bind tasks to CPUs (default)\n" +" rank bind by task rank\n" +" map_cpu:<list> specify a CPU ID binding for each task\n" +" where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" +" mask_cpu:<list> specify a CPU ID binding mask for each task\n" +" where <list> is <mask1>,<mask2>,...<maskN>\n" +" sockets auto-generated masks bind to sockets\n" +" cores auto-generated masks bind to cores\n" +" threads auto-generated masks bind to threads\n" +" help show this help message\n"); +} + /* * verify cpu_bind arguments + * + * we support different launch policy names + * we also allow a verbose setting to be specified + * --cpu_bind=threads + * --cpu_bind=cores + * --cpu_bind=sockets + * --cpu_bind=v + * --cpu_bind=rank,v + * --cpu_bind=rank + * --cpu_bind={MAP_CPU|MASK_CPU}:0,1,2,3,4 + * + * * returns -1 on error, 0 otherwise */ static int _verify_cpu_bind(const char *arg, char **cpu_bind, - cpu_bind_type_t *cpu_bind_type) + cpu_bind_type_t *flags) { char *buf, *p, *tok; - if (!arg) { + int bind_bits = + CPU_BIND_NONE|CPU_BIND_RANK|CPU_BIND_MAP|CPU_BIND_MASK; + int bind_to_bits = + CPU_BIND_TO_SOCKETS|CPU_BIND_TO_CORES|CPU_BIND_TO_THREADS; + + if (arg == NULL) { return 0; } - /* we support different launch policy names - * we also allow a verbose setting to be specified - * --cpu_bind=threads - * --cpu_bind=cores - * --cpu_bind=sockets - * --cpu_bind=v - * --cpu_bind=rank,v - * --cpu_bind=rank - * --cpu_bind={MAP_CPU|MASK_CPU}:0,1,2,3,4 - */ + buf = xstrdup(arg); p = buf; /* change all ',' delimiters not followed by a digit to ';' */ @@ -243,50 +283,27 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, p = 
buf; while ((tok = strsep(&p, ";"))) { if (strcasecmp(tok, "help") == 0) { - printf( -"CPU bind options:\n" -" --cpu_bind= Bind tasks to CPUs\n" -" q[uiet] quietly bind before task runs (default)\n" -" v[erbose] verbosely report binding before task runs\n" -" no[ne] don't bind tasks to CPUs (default)\n" -" rank bind by task rank\n" -" map_cpu:<list> specify a CPU ID binding for each task\n" -" where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" -" mask_cpu:<list> specify a CPU ID binding mask for each task\n" -" where <list> is <mask1>,<mask2>,...<maskN>\n" -" sockets auto-generated masks bind to sockets\n" -" cores auto-generated masks bind to cores\n" -" threads auto-generated masks bind to threads\n" -" help show this help message\n"); + _print_cpu_bind_help(); return 1; } else if ((strcasecmp(tok, "q") == 0) || (strcasecmp(tok, "quiet") == 0)) { - *cpu_bind_type &= ~CPU_BIND_VERBOSE; + *flags &= ~CPU_BIND_VERBOSE; } else if ((strcasecmp(tok, "v") == 0) || (strcasecmp(tok, "verbose") == 0)) { - *cpu_bind_type |= CPU_BIND_VERBOSE; + *flags |= CPU_BIND_VERBOSE; } else if ((strcasecmp(tok, "no") == 0) || (strcasecmp(tok, "none") == 0)) { - *cpu_bind_type |= CPU_BIND_NONE; - *cpu_bind_type &= ~CPU_BIND_RANK; - *cpu_bind_type &= ~CPU_BIND_MAP; - *cpu_bind_type &= ~CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_NONE); xfree(*cpu_bind); } else if (strcasecmp(tok, "rank") == 0) { - *cpu_bind_type &= ~CPU_BIND_NONE; - *cpu_bind_type |= CPU_BIND_RANK; - *cpu_bind_type &= ~CPU_BIND_MAP; - *cpu_bind_type &= ~CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_RANK); xfree(*cpu_bind); } else if ((strncasecmp(tok, "map_cpu", 7) == 0) || (strncasecmp(tok, "mapcpu", 6) == 0)) { char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *cpu_bind_type &= ~CPU_BIND_NONE; - *cpu_bind_type &= ~CPU_BIND_RANK; - *cpu_bind_type |= CPU_BIND_MAP; - *cpu_bind_type &= ~CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_MAP); 
xfree(*cpu_bind); if (list && *list) { *cpu_bind = xstrdup(list); @@ -300,10 +317,7 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *cpu_bind_type &= ~CPU_BIND_NONE; - *cpu_bind_type &= ~CPU_BIND_RANK; - *cpu_bind_type &= ~CPU_BIND_MAP; - *cpu_bind_type |= CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_MASK); xfree(*cpu_bind); if (list && *list) { *cpu_bind = xstrdup(list); @@ -314,19 +328,16 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, } } else if ((strcasecmp(tok, "socket") == 0) || (strcasecmp(tok, "sockets") == 0)) { - *cpu_bind_type |= CPU_BIND_TO_SOCKETS; - *cpu_bind_type &= ~CPU_BIND_TO_CORES; - *cpu_bind_type &= ~CPU_BIND_TO_THREADS; + clear_then_set((int *)flags, bind_to_bits, + CPU_BIND_TO_SOCKETS); } else if ((strcasecmp(tok, "core") == 0) || (strcasecmp(tok, "cores") == 0)) { - *cpu_bind_type &= ~CPU_BIND_TO_SOCKETS; - *cpu_bind_type |= CPU_BIND_TO_CORES; - *cpu_bind_type &= ~CPU_BIND_TO_THREADS; + clear_then_set((int *)flags, bind_to_bits, + CPU_BIND_TO_CORES); } else if ((strcasecmp(tok, "thread") == 0) || (strcasecmp(tok, "threads") == 0)) { - *cpu_bind_type &= ~CPU_BIND_TO_SOCKETS; - *cpu_bind_type &= ~CPU_BIND_TO_CORES; - *cpu_bind_type |= CPU_BIND_TO_THREADS; + clear_then_set((int *)flags, bind_to_bits, + CPU_BIND_TO_THREADS); } else { error("unrecognized --cpu_bind argument \"%s\"", tok); xfree(buf); @@ -338,24 +349,46 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, return 0; } +static void _print_mem_bind_help() +{ + printf( +"Memory bind options:\n" +" --mem_bind= Bind memory to locality domains (ldom)\n" +" q[uiet] quietly bind before task runs (default)\n" +" v[erbose] verbosely report binding before task runs\n" +" no[ne] don't bind tasks to memory (default)\n" +" rank bind by task rank\n" +" local bind to memory local to processor\n" +" map_mem:<list> specify a memory binding for each task\n" +" where 
<list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" +" mask_mem:<list> specify a memory binding mask for each tasks\n" +" where <list> is <mask1>,<mask2>,...<maskN>\n" +" help show this help message\n"); +} + /* * verify mem_bind arguments + * + * we support different memory binding names + * we also allow a verbose setting to be specified + * --mem_bind=v + * --mem_bind=rank,v + * --mem_bind=rank + * --mem_bind={MAP_MEM|MASK_MEM}:0,1,2,3,4 + * * returns -1 on error, 0 otherwise */ static int _verify_mem_bind(const char *arg, char **mem_bind, - mem_bind_type_t *mem_bind_type) + mem_bind_type_t *flags) { char *buf, *p, *tok; - if (!arg) { + int bind_bits = MEM_BIND_NONE|MEM_BIND_RANK|MEM_BIND_LOCAL| + MEM_BIND_MAP|MEM_BIND_MASK; + + if (arg == NULL) { return 0; } - /* we support different memory binding names - * we also allow a verbose setting to be specified - * --mem_bind=v - * --mem_bind=rank,v - * --mem_bind=rank - * --mem_bind={MAP_MEM|MASK_MEM}:0,1,2,3,4 - */ + buf = xstrdup(arg); p = buf; /* change all ',' delimiters not followed by a digit to ';' */ @@ -369,59 +402,31 @@ static int _verify_mem_bind(const char *arg, char **mem_bind, p = buf; while ((tok = strsep(&p, ";"))) { if (strcasecmp(tok, "help") == 0) { - printf( -"Memory bind options:\n" -" --mem_bind= Bind memory to locality domains (ldom)\n" -" q[uiet] quietly bind before task runs (default)\n" -" v[erbose] verbosely report binding before task runs\n" -" no[ne] don't bind tasks to memory (default)\n" -" rank bind by task rank\n" -" local bind to memory local to processor\n" -" map_mem:<list> specify a memory binding for each task\n" -" where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" -" mask_mem:<list> specify a memory binding mask for each tasks\n" -" where <list> is <mask1>,<mask2>,...<maskN>\n" -" help show this help message\n"); + _print_mem_bind_help(); return 1; } else if ((strcasecmp(tok, "q") == 0) || (strcasecmp(tok, "quiet") == 0)) { - *mem_bind_type &= ~MEM_BIND_VERBOSE; + *flags &= 
~MEM_BIND_VERBOSE; } else if ((strcasecmp(tok, "v") == 0) || (strcasecmp(tok, "verbose") == 0)) { - *mem_bind_type |= MEM_BIND_VERBOSE; + *flags |= MEM_BIND_VERBOSE; } else if ((strcasecmp(tok, "no") == 0) || (strcasecmp(tok, "none") == 0)) { - *mem_bind_type |= MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_NONE); xfree(*mem_bind); } else if (strcasecmp(tok, "rank") == 0) { - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type |= MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_RANK); xfree(*mem_bind); } else if (strcasecmp(tok, "local") == 0) { - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type |= MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_LOCAL); xfree(*mem_bind); } else if ((strncasecmp(tok, "map_mem", 7) == 0) || (strncasecmp(tok, "mapmem", 6) == 0)) { char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type |= MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_MAP); xfree(*mem_bind); if (list && *list) { *mem_bind = xstrdup(list); @@ -435,11 +440,7 @@ static int _verify_mem_bind(const char *arg, char **mem_bind, char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type |= MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_MASK); xfree(*mem_bind); if (list && *list) { *mem_bind = 
xstrdup(list); @@ -1547,9 +1548,6 @@ static void _opt_args(int argc, char **argv) opt.argv[0] = fullpath; } } - - if (!_opt_verify()) - exit(1); } static bool diff --git a/src/slaunch/slaunch.c b/src/slaunch/slaunch.c index 8d783768350e5747a27f8a3982563b3459a2ee99..ba9ac6d74abe5e543db2e1c58af701a336bf533a 100644 --- a/src/slaunch/slaunch.c +++ b/src/slaunch/slaunch.c @@ -78,7 +78,7 @@ #include "src/slaunch/opt.h" #include "src/slaunch/sigstr.h" -#include "src/slaunch/attach.h" +#include "src/slaunch/debugger.h" #include "src/slaunch/slaunch.h" #include "src/slaunch/fname.h" #include "src/slaunch/multi_prog.h" diff --git a/src/srun/Makefile.am b/src/srun/Makefile.am index 38612377873b936a1179ed9502b0557ae7ce4ee5..2c728a229659317ffc98c184404004a3d74795ca 100644 --- a/src/srun/Makefile.am +++ b/src/srun/Makefile.am @@ -14,10 +14,8 @@ srun_SOURCES = \ signals.c signals.h \ launch.c \ launch.h \ - attach.h \ - attach.c \ - reattach.c \ - reattach.h \ + debugger.h \ + debugger.c \ fname.c \ fname.h \ sigstr.c \ diff --git a/src/srun/Makefile.in b/src/srun/Makefile.in index 75f78464bd7515bcb48e18f6b4a7e407460dd3ee..cbb2a13dad188eebbca6744c0f54067ea76c4f1d 100644 --- a/src/srun/Makefile.in +++ b/src/srun/Makefile.in @@ -70,9 +70,9 @@ binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) PROGRAMS = $(bin_PROGRAMS) am_srun_OBJECTS = srun.$(OBJEXT) opt.$(OBJEXT) srun_job.$(OBJEXT) \ msg.$(OBJEXT) signals.$(OBJEXT) launch.$(OBJEXT) \ - attach.$(OBJEXT) reattach.$(OBJEXT) fname.$(OBJEXT) \ - sigstr.$(OBJEXT) allocate.$(OBJEXT) core-format.$(OBJEXT) \ - multi_prog.$(OBJEXT) srun.wrapper.$(OBJEXT) + debugger.$(OBJEXT) fname.$(OBJEXT) sigstr.$(OBJEXT) \ + allocate.$(OBJEXT) core-format.$(OBJEXT) multi_prog.$(OBJEXT) \ + srun.wrapper.$(OBJEXT) srun_OBJECTS = $(am_srun_OBJECTS) srun_DEPENDENCIES = $(convenience_libs) srun_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ @@ -260,10 +260,8 @@ srun_SOURCES = \ signals.c signals.h \ launch.c \ launch.h \ - attach.h \ - attach.c \ - 
reattach.c \ - reattach.h \ + debugger.h \ + debugger.c \ fname.c \ fname.h \ sigstr.c \ @@ -352,14 +350,13 @@ distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/allocate.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/attach.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/core-format.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/debugger.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fname.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/launch.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/msg.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/multi_prog.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/opt.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/reattach.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/signals.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sigstr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/srun.Po@am__quote@ diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 5bcdec4a9fdcbea491cc0251432a485b52bdaa2d..64497f22e02d17e0544133888c999225467c9771 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -59,7 +59,7 @@ #include "src/srun/allocate.h" #include "src/srun/msg.h" #include "src/srun/opt.h" -#include "src/srun/attach.h" +#include "src/srun/debugger.h" #define MAX_ALLOC_WAIT 60 /* seconds */ #define MIN_ALLOC_WAIT 5 /* seconds */ @@ -89,7 +89,7 @@ int allocate_test(void) { int rc; - job_desc_msg_t *j = job_desc_msg_create_from_opts (NULL); + job_desc_msg_t *j = job_desc_msg_create_from_opts(); if(!j) return SLURM_ERROR; @@ -106,7 +106,7 @@ allocate_nodes(void) SigFunc *oquitf, *ointf, *otermf; sigset_t oset; resource_allocation_response_msg_t *resp = NULL; - job_desc_msg_t *j = job_desc_msg_create_from_opts (NULL); + job_desc_msg_t *j = job_desc_msg_create_from_opts(); if(!j) return NULL; @@ -391,7 +391,7 @@ 
_intr_handler(int signo) * (see opt.h) */ job_desc_msg_t * -job_desc_msg_create_from_opts (char *script) +job_desc_msg_create_from_opts () { job_desc_msg_t *j = xmalloc(sizeof(*j)); char buf[8192]; @@ -537,36 +537,6 @@ job_desc_msg_create_from_opts (char *script) j->alloc_resp_port = slurmctld_comm_addr.port; j->other_port = slurmctld_comm_addr.port; - if (script) { - /* - * If script is set then we are building a request for - * a batch job - */ - xassert (opt.batch); - - j->environment = NULL; - if (opt.get_user_env) { - struct passwd *pw = NULL; - pw = getpwuid(opt.uid); - if (pw != NULL) { - j->environment = - env_array_user_default(pw->pw_name); - /* FIXME - should we abort if j->environment - is NULL? */ - } - } - env_array_merge(&j->environment, (const char **)environ); - j->env_size = envcount (j->environment); - j->script = script; - j->argv = remote_argv; - j->argc = remote_argc; - j->err = opt.efname; - j->in = opt.ifname; - j->out = opt.ofname; - j->work_dir = opt.cwd; - j->no_requeue = opt.no_requeue; - } - return (j); } diff --git a/src/srun/allocate.h b/src/srun/allocate.h index f1a3b27e914a9dc0a8f5e83b8538968c2b1e4645..d6ce45bd1ce6ab71211dd778bafc8b68b4b95756 100644 --- a/src/srun/allocate.h +++ b/src/srun/allocate.h @@ -61,10 +61,10 @@ int allocate_test(void); /* * Create a job_desc_msg_t object, filled in from the current srun options - * (see opt.h), if script != NULL then this is a batch job. 
+ * (see opt.h) * The resulting memory must be freed with job_desc_msg_destroy() */ -job_desc_msg_t * job_desc_msg_create_from_opts (char *script); +job_desc_msg_t * job_desc_msg_create_from_opts (void); /* * Destroy (free memory from) a job_desc_msg_t object allocated with diff --git a/src/srun/attach.c b/src/srun/debugger.c similarity index 92% rename from src/srun/attach.c rename to src/srun/debugger.c index 2fd349546b4c40295d0ac750f22ff81c946e7d2c..49ecfbf393608f028fb168b3f13ee21049b44941 100644 --- a/src/srun/attach.c +++ b/src/srun/debugger.c @@ -1,6 +1,6 @@ /*****************************************************************************\ - * attach.c - Definitions needed for parallel debugger - * $Id$ + * debugger.c - Definitions needed for parallel debugger + * $Id: debugger.c 11149 2007-03-14 20:53:19Z morrone $ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
@@ -42,10 +42,10 @@ #include "src/common/log.h" -#include "src/srun/attach.h" +#include "src/srun/debugger.h" /* - * Instantiate extern variables from attach.h + * Instantiate extern variables from debugger.h */ MPIR_PROCDESC *MPIR_proctable; int MPIR_proctable_size; diff --git a/src/srun/attach.h b/src/srun/debugger.h similarity index 93% rename from src/srun/attach.h rename to src/srun/debugger.h index 965f264b39b224d664d7f9b6cae0ab308916ef65..ec3c6ac78beb364084f066829c40848bd86a257d 100644 --- a/src/srun/attach.h +++ b/src/srun/debugger.h @@ -1,5 +1,5 @@ /****************************************************************************\ - * attach.h - definitions needed for TotalView interactions + * debugger.h - definitions needed for TotalView interactions ***************************************************************************** * This file was supplied by James Cownie <jcownie@etnus.com> and provides * information required to interface Slurm to the TotalView debugger from @@ -7,7 +7,7 @@ * http://www.etnus.com/ \*****************************************************************************/ -/* $Id$ +/* $Id: debugger.h 11149 2007-03-14 20:53:19Z morrone $ */ /* This file contains support for bringing processes up stopped, so that @@ -19,8 +19,8 @@ * Nov 27 1996 jcownie@dolphinics.com: Added the executable_name to MPIR_PROCDESC */ -#ifndef _ATTACH_INCLUDE -#define _ATTACH_INCLUDE +#ifndef _DEBUGGER_INCLUDE +#define _DEBUGGER_INCLUDE #ifndef VOLATILE #if defined(__STDC__) || defined(__cplusplus) diff --git a/src/srun/launch.c b/src/srun/launch.c index cbb8d0799c9387b5597e669f679e4c852ae5b8d9..6edd6c7652a4e937a3dc75b20ce9a8b7541d9b81 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -125,8 +125,8 @@ launch(void *arg) r.job_id = job->jobid; r.uid = opt.uid; r.gid = opt.gid; - r.argc = remote_argc; - r.argv = remote_argv; + r.argc = opt.argc; + r.argv = opt.argv; r.cred = job->cred; r.job_step_id = job->stepid; r.envc = my_envc; @@ -278,68 +278,28 @@ static 
void _update_failed_node(srun_job_t *j, int id) { int i; - pipe_enum_t pipe_enum = PIPE_HOST_STATE; pthread_mutex_lock(&j->task_mutex); if (j->host_state[id] == SRUN_HOST_INIT) { j->host_state[id] = SRUN_HOST_UNREACHABLE; - - if(message_thread) { - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &id,sizeof(int)); - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &j->host_state[id],sizeof(int)); - } } - pipe_enum = PIPE_TASK_STATE; for (i = 0; i < j->step_layout->tasks[id]; i++) { j->task_state[j->step_layout->tids[id][i]] = SRUN_TASK_FAILED; - - if(message_thread) { - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &j->step_layout->tids[id][i], sizeof(int)); - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &j->task_state[j->step_layout->tids[id][i]], - sizeof(int)); - } } pthread_mutex_unlock(&j->task_mutex); /* update_failed_tasks(j, id); */ - return; -rwfail: - pthread_mutex_unlock(&j->task_mutex); - error("_update_failed_node: " - "write from srun message-handler process failed"); } static void _update_contacted_node(srun_job_t *j, int id) { - pipe_enum_t pipe_enum = PIPE_HOST_STATE; pthread_mutex_lock(&j->task_mutex); if (j->host_state[id] == SRUN_HOST_INIT) { j->host_state[id] = SRUN_HOST_CONTACTED; - if(message_thread) { - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &id, sizeof(int)); - safe_write(j->forked_msg->par_msg->msg_pipe[1], - &j->host_state[id], sizeof(int)); - } } pthread_mutex_unlock(&j->task_mutex); - return; -rwfail: - pthread_mutex_unlock(&j->task_mutex); - error("_update_contacted_node: " - "write from srun message-handler process failed"); } static void diff --git a/src/srun/msg.c b/src/srun/msg.c index a13bb33439484c99c307d0b4b799e090c0dba4e7..7c1b648527dd3a333fcb4f76e8bb880c324f4c23 100644 --- 
a/src/srun/msg.c +++ b/src/srun/msg.c @@ -73,7 +73,7 @@ #include "src/srun/opt.h" #include "src/srun/msg.h" #include "src/srun/sigstr.h" -#include "src/srun/attach.h" +#include "src/srun/debugger.h" #include "src/srun/allocate.h" #include "src/srun/multi_prog.h" #include "src/srun/signals.h" @@ -103,8 +103,7 @@ static void _msg_thr_poll(srun_job_t *job); static void _set_jfds_nonblocking(srun_job_t *job); static void _print_pid_list(const char *host, int ntasks, uint32_t *pid, char *executable_name); -static void _node_fail_handler(int fd, srun_job_t *job); -static void _node_fail_forwarder(char *nodelist, srun_job_t *job); +static void _node_fail_handler(const char *nodelist, srun_job_t *job); #define _poll_set_rd(_pfd, _fd) do { \ (_pfd).fd = _fd; \ @@ -120,47 +119,11 @@ static void _node_fail_forwarder(char *nodelist, srun_job_t *job); #define _poll_wr_isset(pfd) ((pfd).revents & POLLOUT) #define _poll_err(pfd) ((pfd).revents & POLLERR) -/* fd is job->forked_msg->par_msg->msg_pipe[1] */ -static void _update_mpir_proctable(int fd, srun_job_t *job, +static void _update_mpir_proctable(srun_job_t *job, int nodeid, int ntasks, uint32_t *pid, char *executable) -{ - int msg_type = PIPE_UPDATE_MPIR_PROCTABLE; - int dummy = 0xdeadbeef; - int len; - int i; - - xassert(message_thread); - safe_write(fd, &msg_type, sizeof(int)); /* read by par_thr() */ - safe_write(fd, &dummy, sizeof(int)); /* read by par_thr() */ - - /* the rest are read by _handle_update_mpir_proctable() */ - safe_write(fd, &nodeid, sizeof(int)); - safe_write(fd, &ntasks, sizeof(int)); - len = strlen(executable) + 1; - safe_write(fd, &len, sizeof(int)); - if (len > 0) { - safe_write(fd, executable, len); - } - for (i = 0; i < ntasks; i++) { - int taskid = job->step_layout->tids[nodeid][i]; - safe_write(fd, &taskid, sizeof(int)); - safe_write(fd, &pid[i], sizeof(int)); - } - - return; - -rwfail: - error("_update_mpir_proctable: write to srun main process failed"); -} - -static void 
_handle_update_mpir_proctable(int fd, srun_job_t *job) { static int tasks_recorded = 0; - int nodeid; - int ntasks; - int len; - char *executable = NULL; int i; char *name = NULL; @@ -173,32 +136,25 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job) xstrfmtcat(totalview_jobid, "%u", job->jobid); } - safe_read(fd, &nodeid, sizeof(int)); - safe_read(fd, &ntasks, sizeof(int)); - safe_read(fd, &len, sizeof(int)); - if (len > 0) { - executable = xmalloc(len); - safe_read(fd, executable, len); - - /* remote_argv global will be NULL during an srun --attach */ - if (remote_argv == NULL) { - remote_argc = 1; - xrealloc(remote_argv, 2 * sizeof(char *)); - remote_argv[0] = executable; - remote_argv[1] = NULL; - } + /* FIXME - possibly never, now that --attach is removed */ + /* opt.argv global will be NULL during an srun --attach */ + if (opt.argv == NULL) { + opt.argc = 1; + xrealloc(opt.argv, 2 * sizeof(char *)); + opt.argv[0] = executable; + opt.argv[1] = NULL; } + name = nodelist_nth_host(job->step_layout->node_list, nodeid); for (i = 0; i < ntasks; i++) { MPIR_PROCDESC *tv; - int taskid, pid; + int taskid; - safe_read(fd, &taskid, sizeof(int)); - safe_read(fd, &pid, sizeof(int)); + taskid = job->step_layout->tids[nodeid][i]; tv = &MPIR_proctable[taskid]; tv->host_name = xstrdup(name); - tv->pid = pid; + tv->pid = pid[i]; tv->executable_name = executable; tasks_recorded++; } @@ -215,61 +171,6 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job) } return; - -rwfail: - error("_handle_update_mpir_proctable: " - "read from srun message-handler process failed"); -} - -static void _update_step_layout(int fd, slurm_step_layout_t *layout, - int nodeid) -{ - int msg_type = PIPE_UPDATE_STEP_LAYOUT; - int dummy = 0xdeadbeef; - - safe_write(fd, &msg_type, sizeof(int)); /* read by par_thr() */ - safe_write(fd, &dummy, sizeof(int)); /* read by par_thr() */ - - /* the rest are read by _handle_update_step_layout() */ - safe_write(fd, &nodeid, 
sizeof(int)); - safe_write(fd, &layout->node_cnt, sizeof(uint32_t)); - safe_write(fd, &layout->task_cnt, sizeof(uint32_t)); - safe_write(fd, &layout->tasks[nodeid], sizeof(uint16_t)); - safe_write(fd, layout->tids[nodeid], - layout->tasks[nodeid]*sizeof(uint32_t)); - - return; - -rwfail: - error("_update_step_layout: write to srun main process failed"); -} - -static void _handle_update_step_layout(int fd, slurm_step_layout_t *layout) -{ - int nodeid; - - safe_read(fd, &nodeid, sizeof(int)); - safe_read(fd, &layout->node_cnt, sizeof(uint32_t)); - safe_read(fd, &layout->task_cnt, sizeof(uint32_t)); - xassert(nodeid >= 0 && nodeid <= layout->task_cnt); - - /* If this is the first call to this function, then we probably need - to intialize some of the arrays */ - if (layout->tasks == NULL) - layout->tasks = xmalloc(layout->node_cnt * sizeof(uint16_t *)); - if (layout->tids == NULL) - layout->tids = xmalloc(layout->node_cnt * sizeof(uint32_t *)); - - safe_read(fd, &layout->tasks[nodeid], sizeof(uint16_t)); - xassert(layout->tids[nodeid] == NULL); - layout->tids[nodeid] = xmalloc(layout->tasks[nodeid]*sizeof(uint32_t)); - safe_read(fd, layout->tids[nodeid], - layout->tasks[nodeid]*sizeof(uint32_t)); - return; - -rwfail: - error("_handle_update_step_layout: " - "read from srun message-handler process failed"); } static void _dump_proctable(srun_job_t *job) @@ -294,23 +195,10 @@ static void _dump_proctable(srun_job_t *job) void debugger_launch_failure(srun_job_t *job) { - int i; - pipe_enum_t pipe_enum = PIPE_MPIR_DEBUG_STATE; - if (opt.parallel_debug) { - if(message_thread && job) { - i = MPIR_DEBUG_ABORTING; - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &i, sizeof(int)); - } + MPIR_debug_state = MPIR_DEBUG_ABORTING; + MPIR_Breakpoint(); } - return; -rwfail: - error("debugger_launch_failure: " - "write from srun message-handler process failed"); - } /* @@ -339,10 +227,8 @@ void 
timeout_handler(time_t timeout) * not. The job will continue to execute given the --no-kill option. * Otherwise all of the job's tasks and the job itself are killed.. */ -static void _node_fail_handler(int fd, srun_job_t *job) +static void _node_fail_handler(const char *nodelist, srun_job_t *job) { - char *nodelist = NULL; - int len = 0; hostset_t fail_nodes, all_nodes; hostlist_iterator_t fail_itr; char *node; @@ -351,12 +237,6 @@ static void _node_fail_handler(int fd, srun_job_t *job) int i, j; int node_id, num_tasks; - /* get the hostlist string of failed nodes from the message thread */ - safe_read(fd, &len, sizeof(int)); - nodelist = (char *)xmalloc(len+1); - safe_read(fd, nodelist, len); - nodelist[len] = '\0'; - /* now process the down nodes and tell the IO client about them */ fail_nodes = hostset_create(nodelist); fail_itr = hostset_iterator_create(fail_nodes); @@ -389,51 +269,13 @@ static void _node_fail_handler(int fd, srun_job_t *job) } slurm_mutex_unlock(&job->task_mutex); - if (!opt.allocate) { - client_io_handler_downnodes(job->client_io, node_ids, - num_node_ids); - } + client_io_handler_downnodes(job->client_io, node_ids, num_node_ids); if (!opt.no_kill) { update_job_state(job, SRUN_JOB_FORCETERM); info("sending SIGINT to remaining tasks"); fwd_signal(job, SIGINT, opt.max_threads); } - - xfree(nodelist); - return; -rwfail: - error("Failure reading node failure message from message process: %m"); - if (nodelist != NULL) - xfree(nodelist); - return; -} - -/* - * Forward the node failure message to the main srun process. 
- * - * NOTE: this is called from the forked message handling process - */ -static void _node_fail_forwarder(char *nodelist, srun_job_t *job) -{ - pipe_enum_t pipe_enum = PIPE_NODE_FAIL; - int dummy = 0xdeadbeef; - int pipe_fd = job->forked_msg->par_msg->msg_pipe[1]; - int len; - - len = strlen(nodelist); - if (message_thread) { - safe_write(pipe_fd, &pipe_enum, sizeof(int)); - safe_write(pipe_fd, &dummy, sizeof(int)); - - /* the following writes are handled by _node_fail_handler */ - safe_write(pipe_fd, &len, sizeof(int)); - safe_write(pipe_fd, nodelist, len); - } - return; -rwfail: - error("Failure sending node failure message to main process: %m"); - return; } static bool _job_msg_done(srun_job_t *job) @@ -444,7 +286,6 @@ static bool _job_msg_done(srun_job_t *job) static void _process_launch_resp(srun_job_t *job, launch_tasks_response_msg_t *msg) { - pipe_enum_t pipe_enum = PIPE_HOST_STATE; int nodeid = nodelist_find(job->step_layout->node_list, msg->node_name); @@ -456,105 +297,38 @@ _process_launch_resp(srun_job_t *job, launch_tasks_response_msg_t *msg) job->host_state[nodeid] = SRUN_HOST_REPLIED; pthread_mutex_unlock(&job->task_mutex); - if(message_thread) { - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &nodeid, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->host_state[nodeid], sizeof(int)); - - } - _update_mpir_proctable(job->forked_msg->par_msg->msg_pipe[1], job, + _update_mpir_proctable(job, nodeid, msg->count_of_pids, - msg->local_pids, remote_argv[0]); + msg->local_pids, opt.argv[0]); _print_pid_list( msg->node_name, msg->count_of_pids, - msg->local_pids, remote_argv[0] ); - return; -rwfail: - error("_process_launch_resp: " - "write from srun message-handler process failed"); - -} - -static void -update_tasks_state(srun_job_t *job, uint32_t nodeid) -{ - int i; - pipe_enum_t pipe_enum = PIPE_TASK_STATE; - slurm_mutex_lock(&job->task_mutex); - 
debug2("updating %u tasks state for node %u", - job->step_layout->tasks[nodeid], nodeid); - for (i = 0; i < job->step_layout->tasks[nodeid]; i++) { - uint32_t tid = job->step_layout->tids[nodeid][i]; - - if(message_thread) { - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &tid,sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->task_state[tid],sizeof(int)); - } - } - slurm_mutex_unlock(&job->task_mutex); + msg->local_pids, opt.argv[0] ); return; -rwfail: - slurm_mutex_unlock(&job->task_mutex); - error("update_tasks_state: " - "write from srun message-handler process failed"); - } static void update_running_tasks(srun_job_t *job, uint32_t nodeid) { int i; - pipe_enum_t pipe_enum = PIPE_TASK_STATE; + debug2("updating %u running tasks for node %u", job->step_layout->tasks[nodeid], nodeid); slurm_mutex_lock(&job->task_mutex); for (i = 0; i < job->step_layout->tasks[nodeid]; i++) { uint32_t tid = job->step_layout->tids[nodeid][i]; job->task_state[tid] = SRUN_TASK_RUNNING; - - if(message_thread) { - safe_write(job->forked_msg-> - par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - safe_write(job->forked_msg-> - par_msg->msg_pipe[1],&tid, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->task_state[tid], sizeof(int)); - } } slurm_mutex_unlock(&job->task_mutex); - return; -rwfail: - slurm_mutex_unlock(&job->task_mutex); - error("update_running_tasks: " - "write from srun message-handler process failed"); } static void update_failed_tasks(srun_job_t *job, uint32_t nodeid) { int i; - pipe_enum_t pipe_enum = PIPE_TASK_STATE; slurm_mutex_lock(&job->task_mutex); for (i = 0; i < job->step_layout->tasks[nodeid]; i++) { uint32_t tid = job->step_layout->tids[nodeid][i]; job->task_state[tid] = SRUN_TASK_FAILED; - - if(message_thread) { - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - 
safe_write(job->forked_msg->par_msg->msg_pipe[1], - &tid, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->task_state[tid], sizeof(int)); - } tasks_exited++; } slurm_mutex_unlock(&job->task_mutex); @@ -563,18 +337,13 @@ update_failed_tasks(srun_job_t *job, uint32_t nodeid) debug2("all tasks exited"); update_job_state(job, SRUN_JOB_TERMINATED); } -rwfail: slurm_mutex_unlock(&job->task_mutex); - error("update_failed_tasks: " - "write from srun message-handler process failed"); - } static void _launch_handler(srun_job_t *job, slurm_msg_t *resp) { launch_tasks_response_msg_t *msg = resp->data; - pipe_enum_t pipe_enum = PIPE_HOST_STATE; int nodeid = nodelist_find(job->step_layout->node_list, msg->node_name); @@ -591,15 +360,6 @@ _launch_handler(srun_job_t *job, slurm_msg_t *resp) job->host_state[nodeid] = SRUN_HOST_REPLIED; slurm_mutex_unlock(&job->task_mutex); - if(message_thread) { - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &nodeid, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->host_state[nodeid], - sizeof(int)); - } update_failed_tasks(job, nodeid); /* @@ -615,11 +375,6 @@ _launch_handler(srun_job_t *job, slurm_msg_t *resp) _process_launch_resp(job, msg); update_running_tasks(job, nodeid); } - return; -rwfail: - error("_launch_handler: " - "write from srun message-handler process failed"); - } /* _confirm_launch_complete @@ -655,90 +410,6 @@ _confirm_launch_complete(srun_job_t *job) job->ltimeout = 0; } -static void -_reattach_handler(srun_job_t *job, slurm_msg_t *msg) -{ - int i; - reattach_tasks_response_msg_t *resp = msg->data; - int nodeid = nodelist_find(job->step_layout->node_list, - resp->node_name); - - if ((nodeid < 0) || (nodeid >= job->nhosts)) { - error ("Invalid reattach response received"); - return; - } - - slurm_mutex_lock(&job->task_mutex); - job->host_state[nodeid] = SRUN_HOST_REPLIED; - 
slurm_mutex_unlock(&job->task_mutex); - - if(message_thread) { - pipe_enum_t pipe_enum = PIPE_HOST_STATE; - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &nodeid, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->host_state[nodeid], sizeof(int)); - } - - if (resp->return_code != 0) { - if (job->stepid == NO_VAL) { - error ("Unable to attach to job %d: %s", - job->jobid, slurm_strerror(resp->return_code)); - } else { - error ("Unable to attach to step %d.%d on node %d: %s", - job->jobid, job->stepid, nodeid, - slurm_strerror(resp->return_code)); - } - job->rc = 1; - - update_job_state(job, SRUN_JOB_FAILED); - return; - } - - /* - * store global task id information as returned from slurmd - */ - job->step_layout->tids[nodeid] = - xmalloc( resp->ntasks * sizeof(uint32_t) ); - - job->step_layout->tasks[nodeid] = resp->ntasks; - - info ("ntasks = %d\n"); - - for (i = 0; i < resp->ntasks; i++) { - job->step_layout->tids[nodeid][i] = resp->gtids[i]; - info ("setting task%d on hostid %d\n", - resp->gtids[i], nodeid); - } - _update_step_layout(job->forked_msg->par_msg->msg_pipe[1], - job->step_layout, nodeid); - - /* Build process table for any parallel debugger - */ - if ((remote_argc == 0) && (resp->executable_names)) { - remote_argc = 1; - xrealloc(remote_argv, 2 * sizeof(char *)); - remote_argv[0] = resp->executable_names[0]; - resp->executable_names = NULL; /* nothing left to free */ - remote_argv[1] = NULL; - } - _update_mpir_proctable(job->forked_msg->par_msg->msg_pipe[1], job, - nodeid, resp->ntasks, - resp->local_pids, remote_argv[0]); - - _print_pid_list(resp->node_name, resp->ntasks, resp->local_pids, - remote_argv[0]); - - update_running_tasks(job, nodeid); - return; -rwfail: - error("_reattach_handler: " - "write from srun message-handler process failed"); -} - - static void _print_exit_status(srun_job_t *job, hostlist_t hl, char *host, int status) { @@ 
-796,25 +467,6 @@ _die_if_signaled(srun_job_t *job, int status) } } -static void -_update_task_exitcode(srun_job_t *job, int taskid) -{ - pipe_enum_t pipe_enum = PIPE_TASK_EXITCODE; - - if(message_thread) { - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &taskid, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->tstatus[taskid], sizeof(int)); - } - return; -rwfail: - error("_update_task_exitcode: " - "write from srun message-handler process failed"); -} - static void _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg) { @@ -845,7 +497,6 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg) slurm_mutex_lock(&job->task_mutex); job->tstatus[taskid] = status; - _update_task_exitcode(job, taskid); if (status) job->task_state[taskid] = SRUN_TASK_ABNORMAL_EXIT; else { @@ -864,9 +515,6 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg) } } - update_tasks_state(job, slurm_step_layout_host_id(job->step_layout, - task0)); - _print_exit_status(job, hl, host, status); hostlist_destroy(hl); @@ -920,11 +568,6 @@ _handle_msg(srun_job_t *job, slurm_msg_t *msg) _exit_handler(job, msg); slurm_free_task_exit_msg(msg->data); break; - case RESPONSE_REATTACH_TASKS: - debug2("received reattach response"); - _reattach_handler(job, msg); - slurm_free_reattach_tasks_response_msg(msg->data); - break; case SRUN_PING: debug3("slurmctld ping received"); slurm_send_rc_msg(msg, SLURM_SUCCESS); @@ -944,7 +587,7 @@ _handle_msg(srun_job_t *job, slurm_msg_t *msg) case SRUN_NODE_FAIL: verbose("node_fail received"); nf = msg->data; - _node_fail_forwarder(nf->nodelist, job); + _node_fail_handler(nf->nodelist, job); slurm_free_srun_node_fail_msg(msg->data); break; case RESPONSE_RESOURCE_ALLOCATION: @@ -975,7 +618,7 @@ _handle_msg(srun_job_t *job, slurm_msg_t *msg) static void _accept_msg_connection(srun_job_t *job, int fdnum) { - slurm_fd fd = (slurm_fd) NULL; + slurm_fd fd = (slurm_fd) 
0; slurm_msg_t *msg = NULL; slurm_addr cli_addr; unsigned char *uc; @@ -1155,132 +798,18 @@ void * msg_thr(void *arg) { srun_job_t *job = (srun_job_t *) arg; - forked_msg_pipe_t *par_msg = job->forked_msg->par_msg; debug3("msg thread pid = %lu", (unsigned long) getpid()); slurm_uid = (uid_t) slurm_get_slurm_user_id(); _msg_thr_poll(job); - close(par_msg->msg_pipe[1]); // close excess fildes debug3("msg thread done"); return (void *)1; } - /* - * This function runs in a pthread of the parent srun process and - * handles messages from the srun message-handler process. - */ -void * -par_thr(void *arg) -{ - srun_job_t *job = (srun_job_t *) arg; - forked_msg_pipe_t *par_msg = job->forked_msg->par_msg; - forked_msg_pipe_t *msg_par = job->forked_msg->msg_par; - int c; - pipe_enum_t type=0; - int tid=-1; - int status; - debug3("par thread pid = %lu", (unsigned long) getpid()); - - //slurm_uid = (uid_t) slurm_get_slurm_user_id(); - close(msg_par->msg_pipe[0]); // close read end of pipe - close(par_msg->msg_pipe[1]); // close write end of pipe - while(read(par_msg->msg_pipe[0], &c, sizeof(int)) - == sizeof(int)) { - // getting info from msg thread - if(type == PIPE_NONE) { - debug2("got type %d\n",c); - type = c; - continue; - } - - switch(type) { - case PIPE_JOB_STATE: - debug("PIPE_JOB_STATE, c = %d", c); - update_job_state(job, c); - break; - case PIPE_TASK_STATE: - debug("PIPE_TASK_STATE, c = %d", c); - if(tid == -1) { - tid = c; - continue; - } - slurm_mutex_lock(&job->task_mutex); - job->task_state[tid] = c; - if(c == SRUN_TASK_FAILED) - tasks_exited++; - slurm_mutex_unlock(&job->task_mutex); - if (tasks_exited == opt.nprocs) { - debug2("all tasks exited"); - update_job_state(job, SRUN_JOB_TERMINATED); - } - tid = -1; - break; - case PIPE_TASK_EXITCODE: - debug("PIPE_TASK_EXITCODE"); - if(tid == -1) { - debug(" setting tid"); - tid = c; - continue; - } - slurm_mutex_lock(&job->task_mutex); - debug(" setting task %d exitcode %d", tid, c); - job->tstatus[tid] = c; - 
slurm_mutex_unlock(&job->task_mutex); - tid = -1; - break; - case PIPE_HOST_STATE: - if(tid == -1) { - tid = c; - continue; - } - slurm_mutex_lock(&job->task_mutex); - job->host_state[tid] = c; - slurm_mutex_unlock(&job->task_mutex); - tid = -1; - break; - case PIPE_SIGNALED: - slurm_mutex_lock(&job->state_mutex); - job->signaled = c; - slurm_mutex_unlock(&job->state_mutex); - break; - case PIPE_MPIR_DEBUG_STATE: - MPIR_debug_state = c; - MPIR_Breakpoint(); - if (opt.debugger_test) - _dump_proctable(job); - break; - case PIPE_UPDATE_MPIR_PROCTABLE: - _handle_update_mpir_proctable(par_msg->msg_pipe[0], - job); - break; - case PIPE_UPDATE_STEP_LAYOUT: - _handle_update_step_layout(par_msg->msg_pipe[0], - job->step_layout); - break; - case PIPE_NODE_FAIL: - _node_fail_handler(par_msg->msg_pipe[0], job); - break; - default: - error("Unrecognized message from message thread %d", - type); - } - type = PIPE_NONE; - } - close(par_msg->msg_pipe[0]); // close excess fildes - close(msg_par->msg_pipe[1]); // close excess fildes - if(waitpid(par_msg->pid,&status,0)<0) // wait for pid to finish - return NULL;// there was an error - debug3("par thread done"); - return (void *)1; -} - -/* - * Forks the srun process that handles messages even if the main srun - * process is stopped (for instance, by totalview). Also creates - * the various pthreads used in the original and monitor process. + * Create the message handling pthread. * * NOTE: call this before creating any pthreads to avoid having forked process * hang on localtime_t() mutex locked in parent processes pthread. 
@@ -1290,11 +819,7 @@ msg_thr_create(srun_job_t *job) { int i, retries = 0; pthread_attr_t attr; - int c; - - job->forked_msg = xmalloc(sizeof(forked_msg_t)); - job->forked_msg->par_msg = xmalloc(sizeof(forked_msg_pipe_t)); - job->forked_msg->msg_par = xmalloc(sizeof(forked_msg_pipe_t)); + int rc; set_allocate_job(job); @@ -1310,84 +835,20 @@ msg_thr_create(srun_job_t *job) job->jaddr[i]).sin_port)); } - if (pipe(job->forked_msg->par_msg->msg_pipe) == -1) { - error("pipe(): %m"); - return SLURM_ERROR; - } - if (pipe(job->forked_msg->msg_par->msg_pipe) == -1) { - error("pipe(): %m"); - return SLURM_ERROR; - } - debug2("created the pipes for communication"); - - /* retry fork for super-heavily loaded systems */ - for (i = 0; ; i++) { - if((job->forked_msg->par_msg->pid = fork()) != -1) - break; - if (i < 3) - usleep(1000); - else { - error("fork(): %m"); - return SLURM_ERROR; - } + slurm_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); +thread_create_retry: + rc = pthread_create(&job->msg_tid, &attr, &msg_thr, (void *)job); + if (rc) { + if (++retries > MAX_RETRIES) + fatal("Can't create pthread"); + sleep(1); + goto thread_create_retry; } - if (job->forked_msg->par_msg->pid == 0) { - /* child */ - setsid(); - message_thread = 1; - close(job->forked_msg-> - par_msg->msg_pipe[0]); // close read end of pipe - close(job->forked_msg-> - msg_par->msg_pipe[1]); // close write end of pipe - slurm_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); - while ((errno = pthread_create(&job->jtid, &attr, &msg_thr, - (void *)job))) { - if (++retries > MAX_RETRIES) - fatal("Can't create pthread"); - sleep(1); - } - slurm_attr_destroy(&attr); - debug("Started msg to parent server thread (%lu)", - (unsigned long) job->jtid); + slurm_attr_destroy(&attr); + debug("Started message thread (%lu)", (unsigned long) job->msg_tid); - /* - * Wait for the main srun process to exit. 
When it - * does, the other end of the msg_par->msg_pipe will - * close. - */ - while(read(job->forked_msg->msg_par->msg_pipe[0], - &c, sizeof(int)) > 0) - ; /* do nothing */ - - close(job->forked_msg->msg_par->msg_pipe[0]); - /* - * These xfree aren't really necessary if we are just going - * to exit, and they can cause the message thread to - * segfault. - */ - /* xfree(job->forked_msg->par_msg); */ - /* xfree(job->forked_msg->msg_par); */ - /* xfree(job->forked_msg); */ - _exit(0); - } else { - /* parent */ - - slurm_attr_init(&attr); - while ((errno = pthread_create(&job->jtid, &attr, &par_thr, - (void *)job))) { - if (++retries > MAX_RETRIES) - fatal("Can't create pthread"); - sleep(1); /* sleep and try again */ - } - slurm_attr_destroy(&attr); - - debug("Started parent to msg server thread (%lu)", - (unsigned long) job->jtid); - } - - return SLURM_SUCCESS; } @@ -1419,9 +880,6 @@ extern slurm_fd slurmctld_msg_init(void) if (slurmctld_fd) /* May set early for queued job allocation */ return slurmctld_fd; - if (opt.allocate && opt.noshell) - return -1; - slurmctld_fd = -1; slurmctld_comm_addr.hostname = NULL; slurmctld_comm_addr.port = 0; diff --git a/src/srun/multi_prog.c b/src/srun/multi_prog.c index 5a98ebd03565d8643d098846bd40903d68d8aee8..3b25096366b4c3ce90dde469672736c26d2d0cc1 100644 --- a/src/srun/multi_prog.c +++ b/src/srun/multi_prog.c @@ -58,7 +58,7 @@ #include "src/common/xassert.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" -#include "src/srun/attach.h" +#include "src/srun/debugger.h" /* Given a program name, translate it to a fully qualified pathname * as needed based upon the PATH environment variable */ diff --git a/src/srun/opt.c b/src/srun/opt.c index a74ace9405edd9287ef633ce982722e8885351e0..971478f3f715172fbc010d798a21b91e9a84c5a0 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -83,7 +83,7 @@ #include "src/api/pmi_server.h" #include "src/srun/opt.h" -#include "src/srun/attach.h" +#include "src/srun/debugger.h" 
#include "src/common/mpi.h" /* generic OPT_ definitions -- mainly for use with env vars */ @@ -121,7 +121,6 @@ #define LONG_OPT_GID 0x10b #define LONG_OPT_MPI 0x10c #define LONG_OPT_CORE 0x10e -#define LONG_OPT_NOSHELL 0x10f #define LONG_OPT_DEBUG_TS 0x110 #define LONG_OPT_CONNTYPE 0x111 #define LONG_OPT_TEST_ONLY 0x113 @@ -140,7 +139,6 @@ #define LONG_OPT_MEM_BIND 0x120 #define LONG_OPT_CTRL_COMM_IFHN 0x121 #define LONG_OPT_MULTI 0x122 -#define LONG_OPT_NO_REQUEUE 0x123 #define LONG_OPT_COMMENT 0x124 #define LONG_OPT_SOCKETSPERNODE 0x130 #define LONG_OPT_CORESPERSOCKET 0x131 @@ -161,10 +159,7 @@ #define LONG_OPT_GET_USER_ENV 0x145 /*---- global variables, defined in opt.h ----*/ -char **remote_argv; -int remote_argc; int _verbose; -enum modes mode; opt_t opt; /*---- forward declarations of static functions ----*/ @@ -228,10 +223,10 @@ static bool _verify_hint(const char *arg, int *min_threads, int *max_threads, cpu_bind_type_t *cpu_bind_type); static int _verify_cpu_bind(const char *arg, char **cpu_bind, - cpu_bind_type_t *cpu_bind_type); + cpu_bind_type_t *flags); static int _verify_geometry(const char *arg, uint16_t *geometry); static int _verify_mem_bind(const char *arg, char **mem_bind, - mem_bind_type_t *mem_bind_type); + mem_bind_type_t *flags); static int _verify_conn_type(const char *arg); /*---[ end forward declarations of static functions ]---------------------*/ @@ -247,6 +242,9 @@ int initialize_and_process_args(int argc, char *argv[]) /* initialize options with argv */ _opt_args(argc, argv); + if (!_opt_verify()) + exit(1); + if (_verbose > 3) _opt_list(); @@ -426,27 +424,64 @@ static int _isvalue(char *arg) { return 0; /* not a value */ } +/* + * First clear all of the bits in "*data" which are set in "clear_mask". + * Then set all of the bits in "*data" that are set in "set_mask". 
+ */ +static void clear_then_set(int *data, int clear_mask, int set_mask) +{ + *data &= ~clear_mask; + *data |= set_mask; +} + +static void _print_cpu_bind_help() +{ + printf( +"CPU bind options:\n" +" --cpu_bind= Bind tasks to CPUs\n" +" q[uiet] quietly bind before task runs (default)\n" +" v[erbose] verbosely report binding before task runs\n" +" no[ne] don't bind tasks to CPUs (default)\n" +" rank bind by task rank\n" +" map_cpu:<list> specify a CPU ID binding for each task\n" +" where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" +" mask_cpu:<list> specify a CPU ID binding mask for each task\n" +" where <list> is <mask1>,<mask2>,...<maskN>\n" +" sockets auto-generated masks bind to sockets\n" +" cores auto-generated masks bind to cores\n" +" threads auto-generated masks bind to threads\n" +" help show this help message\n"); +} + /* * verify cpu_bind arguments + * + * we support different launch policy names + * we also allow a verbose setting to be specified + * --cpu_bind=threads + * --cpu_bind=cores + * --cpu_bind=sockets + * --cpu_bind=v + * --cpu_bind=rank,v + * --cpu_bind=rank + * --cpu_bind={MAP_CPU|MASK_CPU}:0,1,2,3,4 + * + * * returns -1 on error, 0 otherwise */ static int _verify_cpu_bind(const char *arg, char **cpu_bind, - cpu_bind_type_t *cpu_bind_type) + cpu_bind_type_t *flags) { char *buf, *p, *tok; - if (!arg) { + int bind_bits = + CPU_BIND_NONE|CPU_BIND_RANK|CPU_BIND_MAP|CPU_BIND_MASK; + int bind_to_bits = + CPU_BIND_TO_SOCKETS|CPU_BIND_TO_CORES|CPU_BIND_TO_THREADS; + + if (arg == NULL) { return 0; } - /* we support different launch policy names - * we also allow a verbose setting to be specified - * --cpu_bind=threads - * --cpu_bind=cores - * --cpu_bind=sockets - * --cpu_bind=v - * --cpu_bind=rank,v - * --cpu_bind=rank - * --cpu_bind={MAP_CPU|MASK_CPU}:0,1,2,3,4 - */ + buf = xstrdup(arg); p = buf; /* change all ',' delimiters not followed by a digit to ';' */ @@ -460,50 +495,27 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, p = 
buf; while ((tok = strsep(&p, ";"))) { if (strcasecmp(tok, "help") == 0) { - printf( -"CPU bind options:\n" -" --cpu_bind= Bind tasks to CPUs\n" -" q[uiet] quietly bind before task runs (default)\n" -" v[erbose] verbosely report binding before task runs\n" -" no[ne] don't bind tasks to CPUs (default)\n" -" rank bind by task rank\n" -" map_cpu:<list> specify a CPU ID binding for each task\n" -" where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" -" mask_cpu:<list> specify a CPU ID binding mask for each task\n" -" where <list> is <mask1>,<mask2>,...<maskN>\n" -" sockets auto-generated masks bind to sockets\n" -" cores auto-generated masks bind to cores\n" -" threads auto-generated masks bind to threads\n" -" help show this help message\n"); + _print_cpu_bind_help(); return 1; } else if ((strcasecmp(tok, "q") == 0) || (strcasecmp(tok, "quiet") == 0)) { - *cpu_bind_type &= ~CPU_BIND_VERBOSE; + *flags &= ~CPU_BIND_VERBOSE; } else if ((strcasecmp(tok, "v") == 0) || (strcasecmp(tok, "verbose") == 0)) { - *cpu_bind_type |= CPU_BIND_VERBOSE; + *flags |= CPU_BIND_VERBOSE; } else if ((strcasecmp(tok, "no") == 0) || (strcasecmp(tok, "none") == 0)) { - *cpu_bind_type |= CPU_BIND_NONE; - *cpu_bind_type &= ~CPU_BIND_RANK; - *cpu_bind_type &= ~CPU_BIND_MAP; - *cpu_bind_type &= ~CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_NONE); xfree(*cpu_bind); } else if (strcasecmp(tok, "rank") == 0) { - *cpu_bind_type &= ~CPU_BIND_NONE; - *cpu_bind_type |= CPU_BIND_RANK; - *cpu_bind_type &= ~CPU_BIND_MAP; - *cpu_bind_type &= ~CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_RANK); xfree(*cpu_bind); } else if ((strncasecmp(tok, "map_cpu", 7) == 0) || (strncasecmp(tok, "mapcpu", 6) == 0)) { char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *cpu_bind_type &= ~CPU_BIND_NONE; - *cpu_bind_type &= ~CPU_BIND_RANK; - *cpu_bind_type |= CPU_BIND_MAP; - *cpu_bind_type &= ~CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_MAP); 
xfree(*cpu_bind); if (list && *list) { *cpu_bind = xstrdup(list); @@ -517,10 +529,7 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *cpu_bind_type &= ~CPU_BIND_NONE; - *cpu_bind_type &= ~CPU_BIND_RANK; - *cpu_bind_type &= ~CPU_BIND_MAP; - *cpu_bind_type |= CPU_BIND_MASK; + clear_then_set((int *)flags, bind_bits, CPU_BIND_MASK); xfree(*cpu_bind); if (list && *list) { *cpu_bind = xstrdup(list); @@ -531,19 +540,16 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, } } else if ((strcasecmp(tok, "socket") == 0) || (strcasecmp(tok, "sockets") == 0)) { - *cpu_bind_type |= CPU_BIND_TO_SOCKETS; - *cpu_bind_type &= ~CPU_BIND_TO_CORES; - *cpu_bind_type &= ~CPU_BIND_TO_THREADS; + clear_then_set((int *)flags, bind_to_bits, + CPU_BIND_TO_SOCKETS); } else if ((strcasecmp(tok, "core") == 0) || (strcasecmp(tok, "cores") == 0)) { - *cpu_bind_type &= ~CPU_BIND_TO_SOCKETS; - *cpu_bind_type |= CPU_BIND_TO_CORES; - *cpu_bind_type &= ~CPU_BIND_TO_THREADS; + clear_then_set((int *)flags, bind_to_bits, + CPU_BIND_TO_CORES); } else if ((strcasecmp(tok, "thread") == 0) || (strcasecmp(tok, "threads") == 0)) { - *cpu_bind_type &= ~CPU_BIND_TO_SOCKETS; - *cpu_bind_type &= ~CPU_BIND_TO_CORES; - *cpu_bind_type |= CPU_BIND_TO_THREADS; + clear_then_set((int *)flags, bind_to_bits, + CPU_BIND_TO_THREADS); } else { error("unrecognized --cpu_bind argument \"%s\"", tok); xfree(buf); @@ -555,24 +561,46 @@ static int _verify_cpu_bind(const char *arg, char **cpu_bind, return 0; } +static void _print_mem_bind_help() +{ + printf( +"Memory bind options:\n" +" --mem_bind= Bind memory to locality domains (ldom)\n" +" q[uiet] quietly bind before task runs (default)\n" +" v[erbose] verbosely report binding before task runs\n" +" no[ne] don't bind tasks to memory (default)\n" +" rank bind by task rank\n" +" local bind to memory local to processor\n" +" map_mem:<list> specify a memory binding for each task\n" +" where
<list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" +" mask_mem:<list> specify a memory binding mask for each tasks\n" +" where <list> is <mask1>,<mask2>,...<maskN>\n" +" help show this help message\n"); +} + /* * verify mem_bind arguments + * + * we support different memory binding names + * we also allow a verbose setting to be specified + * --mem_bind=v + * --mem_bind=rank,v + * --mem_bind=rank + * --mem_bind={MAP_MEM|MASK_MEM}:0,1,2,3,4 + * * returns -1 on error, 0 otherwise */ static int _verify_mem_bind(const char *arg, char **mem_bind, - mem_bind_type_t *mem_bind_type) + mem_bind_type_t *flags) { char *buf, *p, *tok; - if (!arg) { + int bind_bits = MEM_BIND_NONE|MEM_BIND_RANK|MEM_BIND_LOCAL| + MEM_BIND_MAP|MEM_BIND_MASK; + + if (arg == NULL) { return 0; } - /* we support different memory binding names - * we also allow a verbose setting to be specified - * --mem_bind=v - * --mem_bind=rank,v - * --mem_bind=rank - * --mem_bind={MAP_MEM|MASK_MEM}:0,1,2,3,4 - */ + buf = xstrdup(arg); p = buf; /* change all ',' delimiters not followed by a digit to ';' */ @@ -586,59 +614,31 @@ static int _verify_mem_bind(const char *arg, char **mem_bind, p = buf; while ((tok = strsep(&p, ";"))) { if (strcasecmp(tok, "help") == 0) { - printf( -"Memory bind options:\n" -" --mem_bind= Bind memory to locality domains (ldom)\n" -" q[uiet] quietly bind before task runs (default)\n" -" v[erbose] verbosely report binding before task runs\n" -" no[ne] don't bind tasks to memory (default)\n" -" rank bind by task rank\n" -" local bind to memory local to processor\n" -" map_mem:<list> specify a memory binding for each task\n" -" where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" -" mask_mem:<list> specify a memory binding mask for each tasks\n" -" where <list> is <mask1>,<mask2>,...<maskN>\n" -" help show this help message\n"); + _print_mem_bind_help(); return 1; } else if ((strcasecmp(tok, "q") == 0) || (strcasecmp(tok, "quiet") == 0)) { - *mem_bind_type &= ~MEM_BIND_VERBOSE; + *flags &= 
~MEM_BIND_VERBOSE; } else if ((strcasecmp(tok, "v") == 0) || (strcasecmp(tok, "verbose") == 0)) { - *mem_bind_type |= MEM_BIND_VERBOSE; + *flags |= MEM_BIND_VERBOSE; } else if ((strcasecmp(tok, "no") == 0) || (strcasecmp(tok, "none") == 0)) { - *mem_bind_type |= MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_NONE); xfree(*mem_bind); } else if (strcasecmp(tok, "rank") == 0) { - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type |= MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_RANK); xfree(*mem_bind); } else if (strcasecmp(tok, "local") == 0) { - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type |= MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_LOCAL); xfree(*mem_bind); } else if ((strncasecmp(tok, "map_mem", 7) == 0) || (strncasecmp(tok, "mapmem", 6) == 0)) { char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type |= MEM_BIND_MAP; - *mem_bind_type &= ~MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_MAP); xfree(*mem_bind); if (list && *list) { *mem_bind = xstrdup(list); @@ -652,11 +652,7 @@ static int _verify_mem_bind(const char *arg, char **mem_bind, char *list; list = strsep(&tok, ":="); list = strsep(&tok, ":="); - *mem_bind_type &= ~MEM_BIND_NONE; - *mem_bind_type &= ~MEM_BIND_RANK; - *mem_bind_type &= ~MEM_BIND_LOCAL; - *mem_bind_type &= ~MEM_BIND_MAP; - *mem_bind_type |= MEM_BIND_MASK; + clear_then_set((int *)flags, bind_bits, MEM_BIND_MASK); xfree(*mem_bind); if (list && *list) { *mem_bind = 
xstrdup(list); @@ -966,17 +962,12 @@ static void _opt_default() opt.labelio = false; opt.unbuffered = false; opt.overcommit = false; - opt.batch = false; opt.shared = (uint16_t)NO_VAL; opt.no_kill = false; opt.kill_bad_exit = false; opt.immediate = false; - opt.no_requeue = false; - opt.allocate = false; - opt.noshell = false; - opt.attach = NULL; opt.join = false; opt.max_wait = slurm_get_wait_time(); @@ -1027,8 +1018,6 @@ static void _opt_default() opt.task_prolog = NULL; opt.task_epilog = NULL; - mode = MODE_NORMAL; - gethostname_short(hostname, sizeof(hostname)); opt.ctrl_comm_ifhn = xstrdup(hostname); @@ -1086,7 +1075,6 @@ env_vars_t env_vars[] = { {"SLURM_NSOCKETS_PER_NODE",OPT_NSOCKETS,NULL, NULL }, {"SLURM_NCORES_PER_SOCKET",OPT_NCORES, NULL, NULL }, {"SLURM_NTHREADS_PER_CORE",OPT_NTHREADS,NULL, NULL }, -{"SLURM_NO_REQUEUE", OPT_INT, &opt.no_requeue, NULL }, {"SLURM_NO_ROTATE", OPT_NO_ROTATE, NULL, NULL }, {"SLURM_NPROCS", OPT_INT, &opt.nprocs, &opt.nprocs_set }, {"SLURM_OVERCOMMIT", OPT_OVERCOMMIT, NULL, NULL }, @@ -1319,14 +1307,11 @@ _get_resource_range(const char *arg, const char *what, int* min, int *max, return true; } -void set_options(const int argc, char **argv, int first) +static void set_options(const int argc, char **argv) { int opt_char, option_index = 0; struct utsname name; static struct option long_options[] = { - {"attach", required_argument, 0, 'a'}, - {"allocate", no_argument, 0, 'A'}, - {"batch", no_argument, 0, 'b'}, {"extra-node-info", required_argument, 0, 'B'}, {"cpus-per-task", required_argument, 0, 'c'}, {"constraint", required_argument, 0, 'C'}, @@ -1378,7 +1363,6 @@ void set_options(const int argc, char **argv, int first) {"job-mem", required_argument, 0, LONG_OPT_JOBMEM}, {"hint", required_argument, 0, LONG_OPT_HINT}, {"mpi", required_argument, 0, LONG_OPT_MPI}, - {"no-shell", no_argument, 0, LONG_OPT_NOSHELL}, {"tmp", required_argument, 0, LONG_OPT_TMP}, {"jobid", required_argument, 0, LONG_OPT_JOBID}, {"msg-timeout", 
required_argument, 0, LONG_OPT_TIMEO}, @@ -1403,7 +1387,6 @@ void set_options(const int argc, char **argv, int first) {"nice", optional_argument, 0, LONG_OPT_NICE}, {"ctrl-comm-ifhn", required_argument, 0, LONG_OPT_CTRL_COMM_IFHN}, {"multi-prog", no_argument, 0, LONG_OPT_MULTI}, - {"no-requeue", no_argument, 0, LONG_OPT_NO_REQUEUE}, {"comment", required_argument, 0, LONG_OPT_COMMENT}, {"sockets-per-node", required_argument, 0, LONG_OPT_SOCKETSPERNODE}, {"cores-per-socket", required_argument, 0, LONG_OPT_CORESPERSOCKET}, @@ -1419,7 +1402,7 @@ void set_options(const int argc, char **argv, int first) {"get-user-env", no_argument, 0, LONG_OPT_GET_USER_ENV}, {NULL, 0, 0, 0} }; - char *opt_string = "+a:AbB:c:C:d:D:e:g:Hi:IjJ:kKlm:n:N:" + char *opt_string = "+B:c:C:d:D:e:g:Hi:IjJ:kKlm:n:N:" "o:Op:P:qQr:R:st:T:uU:vVw:W:x:XZ"; struct option *optz = spank_option_table_create (long_options); @@ -1431,68 +1414,19 @@ void set_options(const int argc, char **argv, int first) if(opt.progname == NULL) opt.progname = xbasename(argv[0]); - else if(!first) - argv[0] = opt.progname; else - error("opt.progname is set but it is the first time through."); + error("opt.progname is already set."); optind = 0; while((opt_char = getopt_long(argc, argv, opt_string, optz, &option_index)) != -1) { switch (opt_char) { case (int)'?': - if(first) { - fprintf(stderr, "Try \"srun --help\" for more " - "information\n"); - exit(1); - } - break; - case (int)'a': - if(first) { - if (opt.allocate || opt.batch) { - error("can only specify one mode: " - "allocate, attach or batch."); - exit(1); - } - mode = MODE_ATTACH; - opt.attach = strdup(optarg); - } else { - error("Option '%c' can only be set " - "from srun commandline.", opt_char); - } - break; - case (int)'A': - if(first) { - if (opt.attach || opt.batch) { - error("can only specify one mode: " - "allocate, attach or batch."); - exit(1); - } - mode = MODE_ALLOCATE; - opt.allocate = true; - } else { - error("Option '%c' can only be set " - "from srun 
commandline.", opt_char); - } - break; - case (int)'b': - if(first) { - if (opt.allocate || opt.attach) { - error("can only specify one mode: " - "allocate, attach or batch."); - exit(1); - } - mode = MODE_BATCH; - opt.batch = true; - } else { - error("Option '%c' can only be set " - "from srun commandline.", opt_char); - } + fprintf(stderr, + "Try \"srun --help\" for more information\n"); + exit(1); break; case (int)'B': - if(!first && opt.extra_set) - break; - opt.extra_set = _verify_socket_core_thread_count( optarg, &opt.min_sockets_per_node, @@ -1511,37 +1445,24 @@ void set_options(const int argc, char **argv, int first) } break; case (int)'c': - if(!first && opt.cpus_set) - break; opt.cpus_set = true; opt.cpus_per_task = _get_int(optarg, "cpus-per-task", true); break; case (int)'C': - if(!first && opt.constraints) - break; xfree(opt.constraints); opt.constraints = xstrdup(optarg); break; case (int)'d': - if(!first && opt.slurmd_debug) - break; - opt.slurmd_debug = _get_int(optarg, "slurmd-debug", false); break; case (int)'D': - if(!first && opt.cwd_set) - break; - opt.cwd_set = true; xfree(opt.cwd); opt.cwd = xstrdup(optarg); break; case (int)'e': - if(!first && opt.efname) - break; - xfree(opt.efname); if (strncasecmp(optarg, "none", (size_t) 4) == 0) opt.efname = xstrdup("/dev/null"); @@ -1549,8 +1470,6 @@ void set_options(const int argc, char **argv, int first) opt.efname = xstrdup(optarg); break; case (int)'g': - if(!first && opt.geometry) - break; if (_verify_geometry(optarg, opt.geometry)) exit(1); break; @@ -1558,9 +1477,6 @@ void set_options(const int argc, char **argv, int first) opt.hold = true; break; case (int)'i': - if(!first && opt.ifname) - break; - xfree(opt.ifname); if (strncasecmp(optarg, "none", (size_t) 4) == 0) opt.ifname = xstrdup("/dev/null"); @@ -1574,9 +1490,6 @@ void set_options(const int argc, char **argv, int first) opt.join = true; break; case (int)'J': - if(!first && opt.job_name_set) - break; - opt.job_name_set = true; 
xfree(opt.job_name); opt.job_name = xstrdup(optarg); @@ -1591,8 +1504,6 @@ void set_options(const int argc, char **argv, int first) opt.labelio = true; break; case (int)'m': - if(!first && opt.distribution) - break; opt.distribution = _verify_dist_type(optarg, &opt.plane_size); if (opt.distribution == SLURM_DIST_UNKNOWN) { @@ -1602,17 +1513,11 @@ void set_options(const int argc, char **argv, int first) } break; case (int)'n': - if(!first && opt.nprocs_set) - break; - opt.nprocs_set = true; opt.nprocs = _get_int(optarg, "number of tasks", true); break; case (int)'N': - if(!first && opt.nodes_set) - break; - opt.nodes_set = _get_resource_range(optarg, "requested node count", @@ -1626,9 +1531,6 @@ void set_options(const int argc, char **argv, int first) } break; case (int)'o': - if(!first && opt.ofname) - break; - xfree(opt.ofname); if (strncasecmp(optarg, "none", (size_t) 4) == 0) opt.ofname = xstrdup("/dev/null"); @@ -1639,32 +1541,19 @@ void set_options(const int argc, char **argv, int first) opt.overcommit = true; break; case (int)'p': - if(!first && opt.partition) - break; - xfree(opt.partition); opt.partition = xstrdup(optarg); break; case (int)'P': - if(!first && opt.dependency) - break; - opt.dependency = _get_int(optarg, "dependency", true); break; case (int)'q': opt.quit_on_intr = true; break; case (int) 'Q': - if(!first && opt.quiet) - break; - opt.quiet++; break; case (int)'r': - if(!first && opt.relative) - break; - - //xfree(opt.relative); opt.relative = _get_int(optarg, "relative", false); opt.relative_set = true; break; @@ -1675,15 +1564,9 @@ void set_options(const int argc, char **argv, int first) opt.shared = 1; break; case (int)'t': - if(!first && opt.time_limit) - break; - opt.time_limit = _get_int(optarg, "time", true); break; case (int)'T': - if(!first && opt.max_threads) - break; - opt.max_threads = _get_int(optarg, "max_threads", true); pmi_server_max_threads(opt.max_threads); @@ -1692,15 +1575,10 @@ void set_options(const int argc, char 
**argv, int first) opt.unbuffered = true; break; case (int)'U': - if(!first && opt.account) - break; xfree(opt.account); opt.account = xstrdup(optarg); break; case (int)'v': - if(!first && _verbose) - break; - _verbose++; break; case (int)'V': @@ -1708,9 +1586,6 @@ void set_options(const int argc, char **argv, int first) exit(0); break; case (int)'w': - if(!first && opt.nodelist) - break; - xfree(opt.nodelist); opt.nodelist = xstrdup(optarg); #ifdef HAVE_BG @@ -1798,9 +1673,6 @@ void set_options(const int argc, char **argv, int first) optarg); } break; - case LONG_OPT_NOSHELL: - opt.noshell = true; - break; case LONG_OPT_TMP: opt.job_min_tmp_disk = _to_bytes(optarg); if (opt.job_min_tmp_disk < 0) { @@ -1915,12 +1787,7 @@ void set_options(const int argc, char **argv, int first) case LONG_OPT_MULTI: opt.multi_prog = true; break; - case LONG_OPT_NO_REQUEUE: - opt.no_requeue = true; - break; case LONG_OPT_COMMENT: - if(!first && opt.comment) - break; xfree(opt.comment); opt.comment = xstrdup(optarg); break; @@ -1964,26 +1831,18 @@ void set_options(const int argc, char **argv, int first) true); break; case LONG_OPT_BLRTS_IMAGE: - if(!first && opt.blrtsimage) - break; xfree(opt.blrtsimage); opt.blrtsimage = xstrdup(optarg); break; case LONG_OPT_LINUX_IMAGE: - if(!first && opt.linuximage) - break; xfree(opt.linuximage); opt.linuximage = xstrdup(optarg); break; case LONG_OPT_MLOADER_IMAGE: - if(!first && opt.mloaderimage) - break; xfree(opt.mloaderimage); opt.mloaderimage = xstrdup(optarg); break; case LONG_OPT_RAMDISK_IMAGE: - if(!first && opt.ramdiskimage) - break; xfree(opt.ramdiskimage); opt.ramdiskimage = xstrdup(optarg); break; @@ -2000,13 +1859,6 @@ void set_options(const int argc, char **argv, int first) } } - if (!first) { - if (!_opt_verify()) - exit(1); - if (_verbose > 3) - _opt_list(); - } - spank_option_table_destroy (optz); } @@ -2059,7 +1911,7 @@ static void _opt_args(int argc, char **argv) int i; char **rest = NULL; - set_options(argc, argv, 1); + 
set_options(argc, argv); /* When CR with memory as a CR is enabled we need to assign adequate value or check the value to opt.mem */ @@ -2104,38 +1956,33 @@ static void _opt_args(int argc, char **argv) } #endif - remote_argc = 0; + opt.argc = 0; if (optind < argc) { rest = argv + optind; - while (rest[remote_argc] != NULL) - remote_argc++; + while (rest[opt.argc] != NULL) + opt.argc++; } - remote_argv = (char **) xmalloc((remote_argc + 1) * sizeof(char *)); - for (i = 0; i < remote_argc; i++) - remote_argv[i] = xstrdup(rest[i]); - remote_argv[i] = NULL; /* End of argv's (for possible execv) */ + opt.argv = (char **) xmalloc((opt.argc + 1) * sizeof(char *)); + for (i = 0; i < opt.argc; i++) + opt.argv[i] = xstrdup(rest[i]); + opt.argv[i] = NULL; /* End of argv's (for possible execv) */ if (opt.multi_prog) { - if (remote_argc < 1) { + if (opt.argc < 1) { error("configuration file not specified"); exit(1); } - _load_multi(&remote_argc, remote_argv); + _load_multi(&opt.argc, opt.argv); } - else if (remote_argc > 0) { + else if (opt.argc > 0) { char *fullpath; - char *cmd = remote_argv[0]; - bool search_cwd = (opt.batch || opt.allocate); - int mode = (search_cwd) ? 
R_OK : R_OK | X_OK; - if ((fullpath = _search_path(cmd, search_cwd, mode))) { - xfree(remote_argv[0]); - remote_argv[0] = fullpath; + if ((fullpath = _search_path(opt.argv[0], false, R_OK|X_OK))) { + xfree(opt.argv[0]); + opt.argv[0] = fullpath; } } - if (!_opt_verify()) - exit(1); } /* @@ -2187,8 +2034,8 @@ static bool _opt_verify(void) if (opt.job_min_cpus < opt.cpus_per_task) opt.job_min_cpus = opt.cpus_per_task; - if ((opt.job_name == NULL) && (remote_argc > 0)) - opt.job_name = _base_name(remote_argv[0]); + if ((opt.job_name == NULL) && (opt.argc > 0)) + opt.job_name = _base_name(opt.argv[0]); if(!opt.nodelist) { if((opt.nodelist = xstrdup(getenv("SLURM_HOSTFILE")))) { @@ -2211,9 +2058,10 @@ static bool _opt_verify(void) opt.nodelist); } } - } else + } else { if (!_valid_node_list(&opt.nodelist)) exit(1); + } /* now if max is set make sure we have <= max_nodes in the * nodelist but only if it isn't arbitrary since the user has @@ -2243,72 +2091,89 @@ static bool _opt_verify(void) } - if (mode == MODE_ATTACH) { /* attach to a running job */ - if (opt.nodes_set || opt.cpus_set || opt.nprocs_set) { - error("do not specific a node allocation " - "with --attach (-a)"); - verified = false; - } - - /* if (constraints_given()) { - * error("do not specify any constraints with " - * "--attach (-a)"); - * verified = false; - *} - */ + if (opt.argc == 0) { + error("must supply remote command"); + verified = false; + } - } else { /* mode != MODE_ATTACH */ + /* check for realistic arguments */ + if (opt.nprocs <= 0) { + error("%s: invalid number of processes (-n %d)", + opt.progname, opt.nprocs); + verified = false; + } - if ((remote_argc == 0) && (mode != MODE_ALLOCATE)) { - error("must supply remote command"); - verified = false; - } + if (opt.cpus_per_task <= 0) { + error("%s: invalid number of cpus per task (-c %d)\n", + opt.progname, opt.cpus_per_task); + verified = false; + } + if ((opt.min_nodes <= 0) || (opt.max_nodes < 0) || + (opt.max_nodes && (opt.min_nodes > 
opt.max_nodes))) { + error("%s: invalid number of nodes (-N %d-%d)\n", + opt.progname, opt.min_nodes, opt.max_nodes); + verified = false; + } - /* check for realistic arguments */ - if (opt.nprocs <= 0) { - error("%s: invalid number of processes (-n %d)", - opt.progname, opt.nprocs); - verified = false; + /* bound max_threads/cores from ntasks_cores/sockets */ + if ((opt.max_threads_per_core <= 0) && + (opt.ntasks_per_core > 0)) { + opt.max_threads_per_core = opt.ntasks_per_core; + /* if cpu_bind_type doesn't already have a auto pref, + * choose the level based on the level of ntasks + */ + if (!(opt.cpu_bind_type & (CPU_BIND_TO_SOCKETS | + CPU_BIND_TO_CORES | + CPU_BIND_TO_THREADS))) { + opt.cpu_bind_type |= CPU_BIND_TO_CORES; } - - if (opt.cpus_per_task <= 0) { - error("%s: invalid number of cpus per task (-c %d)\n", - opt.progname, opt.cpus_per_task); - verified = false; + } + if ((opt.max_cores_per_socket <= 0) && + (opt.ntasks_per_socket > 0)) { + opt.max_cores_per_socket = opt.ntasks_per_socket; + /* if cpu_bind_type doesn't already have a auto pref, + * choose the level based on the level of ntasks + */ + if (!(opt.cpu_bind_type & (CPU_BIND_TO_SOCKETS | + CPU_BIND_TO_CORES | + CPU_BIND_TO_THREADS))) { + opt.cpu_bind_type |= CPU_BIND_TO_SOCKETS; } + } - if ((opt.min_nodes <= 0) || (opt.max_nodes < 0) || - (opt.max_nodes && (opt.min_nodes > opt.max_nodes))) { - error("%s: invalid number of nodes (-N %d-%d)\n", - opt.progname, opt.min_nodes, opt.max_nodes); - verified = false; + core_format_enable (opt.core_type); + + /* massage the numbers */ + if (opt.nodelist) { + hl = hostlist_create(opt.nodelist); + if (!hl) + fatal("memory allocation failure"); + hostlist_uniq(hl); + hl_cnt = hostlist_count(hl); + if (opt.nodes_set) + opt.min_nodes = MAX(hl_cnt, opt.min_nodes); + else { + opt.min_nodes = hl_cnt; + opt.nodes_set = true; } + } + if ((opt.nodes_set || opt.extra_set) && !opt.nprocs_set) { + /* 1 proc / node default */ + opt.nprocs = opt.min_nodes; - /* bound 
max_threads/cores from ntasks_cores/sockets */ - if ((opt.max_threads_per_core <= 0) && - (opt.ntasks_per_core > 0)) { - opt.max_threads_per_core = opt.ntasks_per_core; - /* if cpu_bind_type doesn't already have a auto pref, - * choose the level based on the level of ntasks - */ - if (!(opt.cpu_bind_type & (CPU_BIND_TO_SOCKETS | - CPU_BIND_TO_CORES | - CPU_BIND_TO_THREADS))) { - opt.cpu_bind_type |= CPU_BIND_TO_CORES; - } + /* 1 proc / min_[socket * core * thread] default */ + if (opt.min_sockets_per_node > 0) { + opt.nprocs *= opt.min_sockets_per_node; + opt.nprocs_set = true; } - if ((opt.max_cores_per_socket <= 0) && - (opt.ntasks_per_socket > 0)) { - opt.max_cores_per_socket = opt.ntasks_per_socket; - /* if cpu_bind_type doesn't already have a auto pref, - * choose the level based on the level of ntasks - */ - if (!(opt.cpu_bind_type & (CPU_BIND_TO_SOCKETS | - CPU_BIND_TO_CORES | - CPU_BIND_TO_THREADS))) { - opt.cpu_bind_type |= CPU_BIND_TO_SOCKETS; - } + if (opt.min_cores_per_socket > 0) { + opt.nprocs *= opt.min_cores_per_socket; + opt.nprocs_set = true; + } + if (opt.min_threads_per_core > 0) { + opt.nprocs *= opt.min_threads_per_core; + opt.nprocs_set = true; } core_format_enable (opt.core_type); @@ -2334,54 +2199,36 @@ static bool _opt_verify(void) used later */ } + } else if (opt.nodes_set && opt.nprocs_set) { - if ((opt.nodes_set || opt.extra_set) && !opt.nprocs_set) { - /* 1 proc / node default */ - opt.nprocs = opt.min_nodes; - - /* 1 proc / min_[socket * core * thread] default */ - if (opt.min_sockets_per_node > 0) { - opt.nprocs *= opt.min_sockets_per_node; - opt.nprocs_set = true; - } - if (opt.min_cores_per_socket > 0) { - opt.nprocs *= opt.min_cores_per_socket; - opt.nprocs_set = true; - } - if (opt.min_threads_per_core > 0) { - opt.nprocs *= opt.min_threads_per_core; - opt.nprocs_set = true; - } - } else if (opt.nodes_set && opt.nprocs_set) { - - /* - * make sure # of procs >= min_nodes - */ - if (opt.nprocs < opt.min_nodes) { - - info ("Warning: 
can't run %d processes on %d " - "nodes, setting nnodes to %d", - opt.nprocs, opt.min_nodes, opt.nprocs); - - opt.min_nodes = opt.nprocs; - if (opt.max_nodes - && (opt.min_nodes > opt.max_nodes) ) - opt.max_nodes = opt.min_nodes; - if (hl_cnt > opt.min_nodes) { - int del_cnt, i; - char *host; - del_cnt = hl_cnt - opt.min_nodes; - for (i=0; i<del_cnt; i++) { - host = hostlist_pop(hl); - free(host); - } - hostlist_ranged_string(hl, strlen(opt.nodelist)+1, - opt.nodelist); + /* + * make sure # of procs >= min_nodes + */ + if (opt.nprocs < opt.min_nodes) { + + info ("Warning: can't run %d processes on %d " + "nodes, setting nnodes to %d", + opt.nprocs, opt.min_nodes, opt.nprocs); + + opt.min_nodes = opt.nprocs; + if (opt.max_nodes + && (opt.min_nodes > opt.max_nodes) ) + opt.max_nodes = opt.min_nodes; + if (hl_cnt > opt.min_nodes) { + int del_cnt, i; + char *host; + del_cnt = hl_cnt - opt.min_nodes; + for (i=0; i<del_cnt; i++) { + host = hostlist_pop(hl); + free(host); } + hostlist_ranged_string(hl, strlen(opt.nodelist)+1, + opt.nodelist); } + } + + } /* else if (opt.nprocs_set && !opt.nodes_set) */ - } /* else if (opt.nprocs_set && !opt.nodes_set) */ - } if (hl) hostlist_destroy(hl); @@ -2418,11 +2265,6 @@ static bool _opt_verify(void) if ((opt.egid != (gid_t) -1) && (opt.egid != opt.gid)) opt.gid = opt.egid; - if (opt.noshell && !opt.allocate) { - error ("--no-shell only valid with -A (--allocate)"); - verified = false; - } - if (opt.propagate && parse_rlimits( opt.propagate, PROPAGATE_RLIMITS)) { error( "--propagate=%s is not valid.", opt.propagate ); verified = false; @@ -2589,8 +2431,8 @@ print_commandline() char buf[256]; buf[0] = '\0'; - for (i = 0; i < remote_argc; i++) - snprintf(buf, 256, "%s", remote_argv[i]); + for (i = 0; i < opt.argc; i++) + snprintf(buf, 256, "%s", opt.argv[i]); return xstrdup(buf); } @@ -2654,13 +2496,9 @@ static void _opt_list() info("verbose : %d", _verbose); info("slurmd_debug : %d", opt.slurmd_debug); info("immediate : %s", 
tf_(opt.immediate)); - info("no-requeue : %s", tf_(opt.no_requeue)); info("label output : %s", tf_(opt.labelio)); info("unbuffered IO : %s", tf_(opt.unbuffered)); - info("allocate : %s", tf_(opt.allocate)); - info("attach : `%s'", opt.attach); info("overcommit : %s", tf_(opt.overcommit)); - info("batch : %s", tf_(opt.batch)); info("threads : %d", opt.max_threads); if (opt.time_limit == INFINITE) info("time_limit : INFINITE"); @@ -2736,8 +2574,8 @@ static void _usage(void) " [-c ncpus] [-r n] [-p partition] [--hold] [-t minutes]\n" " [-D path] [--immediate] [--overcommit] [--no-kill]\n" " [--share] [--label] [--unbuffered] [-m dist] [-J jobname]\n" -" [--jobid=id] [--batch] [--verbose] [--slurmd_debug=#]\n" -" [--core=type] [-T threads] [-W sec] [--attach] [--join] \n" +" [--jobid=id] [--verbose] [--slurmd_debug=#]\n" +" [--core=type] [-T threads] [-W sec] \n" " [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n" " [--mpi=type] [--account=name] [--dependency=jobid]\n" " [--kill-on-bad-exit] [--propagate[=rlimits] [--comment=name]\n" @@ -2752,7 +2590,7 @@ static void _usage(void) " [--mail-type=type] [--mail-user=user] [--nice[=value]]\n" " [--prolog=fname] [--epilog=fname]\n" " [--task-prolog=fname] [--task-epilog=fname]\n" - " [--ctrl-comm-ifhn=addr] [--multi-prog] [--no-requeue]\n" + " [--ctrl-comm-ifhn=addr] [--multi-prog]\n" " [-w hosts...] [-x hosts...] executable [args...]\n"); } @@ -2817,16 +2655,6 @@ static void _help(void) " --multi-prog if set the program name specified is the\n" " configuration specification for multiple programs\n" " --get-user-env used by Moab. 
See srun man page.\n" -" --no-requeue if set, do not permit the job to be requeued\n" -"\n" -"Allocate only:\n" -" -A, --allocate allocate resources and spawn a shell\n" -" --no-shell don't spawn shell in allocate mode\n" -"\n" -"Attach to running job:\n" -" -a, --attach=jobid attach to running job with specified id\n" -" -j, --join when used with --attach, allow forwarding of\n" -" signals and stdin.\n" "\n" "Constraint options:\n" " --mincpus=n minimum number of cpus per node\n" diff --git a/src/srun/opt.h b/src/srun/opt.h index 66367495dd921b6fecff090659025a1754252cec..4053916e8784eb7f2c654195214828de58922b9f 100644 --- a/src/srun/opt.h +++ b/src/srun/opt.h @@ -58,20 +58,8 @@ #define INT_UNASSIGNED ((int)-1) /* global variables relating to user options */ -extern char **remote_argv; -extern int remote_argc; extern int _verbose; -/* mutually exclusive modes for srun */ -enum modes { - MODE_UNKNOWN = 0, - MODE_NORMAL = 1, - MODE_IMMEDIATE = 2, - MODE_ATTACH = 3, - MODE_ALLOCATE = 4, - MODE_BATCH = 5 -}; - extern enum modes mode; #define format_task_dist_states(t) (t == SLURM_DIST_BLOCK) ? 
"block" : \ @@ -142,7 +130,6 @@ typedef struct srun_options { int slurmd_debug; /* --slurmd-debug, -D */ core_format_t core_type;/* --core= */ - char *attach; /* --attach=id -a id */ bool join; /* --join, -j */ /* no longer need these, they are set globally : */ @@ -154,13 +141,9 @@ typedef struct srun_options { bool hold; /* --hold, -H */ bool labelio; /* --label-output, -l */ bool unbuffered; /* --unbuffered, -u */ - bool allocate; /* --allocate, -A */ - bool noshell; /* --noshell */ bool overcommit; /* --overcommit, -O */ - bool batch; /* --batch, -b */ bool no_kill; /* --no-kill, -k */ bool kill_bad_exit; /* --kill-on-bad-exit, -K */ - bool no_requeue; /* --no-requeue */ uint16_t shared; /* --share, -s */ int max_wait; /* --wait, -W */ bool quit_on_intr; /* --quit-on-interrupt, -q */ @@ -212,6 +195,8 @@ typedef struct srun_options { char *mail_user; /* --mail-user */ char *ctrl_comm_ifhn; /* --ctrl-comm-ifhn */ bool get_user_env; /* --get-user-env */ + int argc; /* length of argv array */ + char **argv; /* left over on command line */ } opt_t; extern opt_t opt; @@ -237,8 +222,4 @@ extern opt_t opt; */ int initialize_and_process_args(int argc, char *argv[]); -/* set options based upon commandline args */ -void set_options(const int argc, char **argv, int first); - - #endif /* _HAVE_OPT_H */ diff --git a/src/srun/reattach.c b/src/srun/reattach.c deleted file mode 100644 index b74dddb5653f7310fc01e82865f0f1b0ebfcac1c..0000000000000000000000000000000000000000 --- a/src/srun/reattach.c +++ /dev/null @@ -1,545 +0,0 @@ -/****************************************************************************\ - * src/srun/reattach.c - reattach to a running job - * $Id$ - ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Mark Grondona <grondona@llnl.gov>. - * UCRL-CODE-226842. 
- * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-\*****************************************************************************/ - -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include <errno.h> -#include <signal.h> -#include <string.h> -#include <stdlib.h> -#include <unistd.h> -#include <sys/param.h> - -#include "src/common/xmalloc.h" -#include "src/common/xstring.h" -#include "src/common/xsignal.h" -#include "src/common/log.h" -#include "src/common/list.h" -#include "src/common/macros.h" -#include "src/common/hostlist.h" -#include "src/common/slurm_protocol_api.h" -#include "src/common/read_config.h" -#include "src/common/forward.h" - -#include "src/srun/srun_job.h" -#include "src/srun/launch.h" -#include "src/srun/opt.h" -#include "src/srun/msg.h" -#include "src/srun/srun.h" -#include "src/srun/signals.h" - - -/* number of active threads */ -static pthread_mutex_t active_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t active_cond = PTHREAD_COND_INITIALIZER; -static int active = 0; - -static bool invalid_user = false; - -typedef enum {THD_NEW, THD_ACTIVE, THD_DONE, THD_FAILED} state_t; - -typedef struct thd { - pthread_t thread; /* thread ID */ - pthread_attr_t attr; /* thread attributes */ - state_t state; /* thread state */ - slurm_msg_t *msg; - srun_job_t *job; - uint32_t nodeid; -} thd_t; - -static void _p_reattach(slurm_msg_t *req, srun_job_t *job); -static void *_p_reattach_task(void *args); - -typedef struct _srun_step { - uint32_t jobid; - uint32_t stepid; - uint32_t ntasks; - char *nodes; - char *name; - bool complete_job; -} srun_step_t; - -static void -_srun_step_destroy(srun_step_t *s) -{ - if (s->name) - xfree(s->name); - if (s->nodes) - xfree(s->nodes); - xfree(s); -} - -static srun_step_t * -_srun_step_create(uint32_t jobid, uint32_t stepid, char *name) -{ - srun_step_t *s = xmalloc(sizeof(*s)); - s->jobid = jobid; - s->stepid = stepid; - s->ntasks = 0; - s->nodes = NULL; - s->name = NULL; - - s->complete_job = false; - - if (name == NULL) - return s; - s->name = xstrdup(name); 
- return s; -} - -static char * -_next_tok(char *sep, char **str) -{ - char *tok; - - /* push str past any leading separators */ - while ((**str != '\0') && (strchr(sep, **str) != '\0')) - (*str)++; - - if (**str == '\0') - return NULL; - - /* assign token ptr */ - tok = *str; - - /* push str past token and leave pointing to first separator */ - while ((**str != '\0') && (strchr(sep, **str) == '\0')) - (*str)++; - - /* nullify consecutive separators and push str beyond them */ - while ((**str != '\0') && (strchr(sep, **str) != '\0')) - *(*str)++ = '\0'; - - return tok; -} - - -static List -_step_list_create(char *steplist) -{ - List l = NULL; - char *str = NULL; - char *orig = NULL; - char *tok = NULL; - uint32_t jobid, stepid; - - if (steplist == NULL) - return NULL; - - orig = str = xstrdup(steplist); - - l = list_create((ListDelF)_srun_step_destroy); - - while ((tok = _next_tok(",", &str))) { - char *cur = tok; - char *p = strchr(tok, '.'); - char *q = NULL; - - if (p) *(p++) = '\0'; - - jobid = strtoul(tok, &q, 10); - - if (q == tok) { - error("Invalid jobid: `%s'", cur); - goto error; - } - - stepid = (p && *p) ? 
strtoul(p, &q, 10) : NO_VAL; - - if ((q == p) || (*q != '\0')) { - error("Invalid job step id: `%s'", cur); - goto error; - } - - list_append(l, _srun_step_create(jobid, stepid, cur)); - } - - xfree(orig); - return l; - - error: - xfree(orig); - list_destroy(l); - return NULL; - -} - -static int -_get_job_info(srun_step_t *s) -{ - int i, rc = -1; - job_info_msg_t *resp = NULL; - job_info_t *job = NULL; - hostlist_t hl; - - s->nodes = NULL; - - if (slurm_load_jobs((time_t) 0, &resp, 1) < 0) { - error("Unable to load jobs: %m"); - goto done; - } - - for (i = 0; i < resp->record_count; i++) { - job = &resp->job_array[i]; - if (job->job_id == s->jobid) - break; - job = NULL; - } - - if (job == NULL) { - error ("Unable to find job %u", s->jobid); - goto done; - } - - if ((job->job_state != JOB_RUNNING) - && (job->job_state != JOB_SUSPENDED)) { - error ("Cannot attach to job %d in state %s", - job->job_id, job_state_string(job->job_state)); - goto done; - } - - if (!job->batch_flag) { - rc = 0; - goto done; - } - - if (!(hl = hostlist_create(job->nodes))) { - error ("Unable to create hostlist from `%s'", job->nodes); - goto done; - } - s->nodes = hostlist_shift(hl); - hostlist_destroy(hl); - - s->ntasks = 1; - rc = 0; - - done: - if (resp) - slurm_free_job_info_msg(resp); - return rc; -} - -static void -_get_step_info(srun_step_t *s) -{ - uid_t my_uid; - job_step_info_response_msg_t *resp = NULL; - - xassert(s->stepid != NO_VAL); - - if (slurm_get_job_steps((time_t) 0, s->jobid, s->stepid, &resp, 1) - < 0) { - error("Unable to get step information for %u.%u: %m", - s->jobid, s->stepid); - goto done; - } - if (resp->job_step_count == 0) { - error("No nodes in %u.%u", s->jobid, s->stepid); - s->ntasks = 0; - goto done; - } - - invalid_user = false; - if ((my_uid = getuid()) != 0) { /* not user root */ - if (my_uid != resp->job_steps->user_id) { - error("Invalid user id"); - invalid_user = true; - /* We let the request continue and log the - * event in SlurmdLog for 
security purposes */ - } - } - s->nodes = xstrdup(resp->job_steps->nodes); - s->ntasks = resp->job_steps->num_tasks; - - done: - if (resp) - slurm_free_job_step_info_response_msg(resp); - return; -} - -static void -_get_attach_info(srun_step_t *s) -{ - if (s->stepid == NO_VAL) { - if (_get_job_info(s) < 0) - return; - - /* If job was not a batch job, try step 0 - */ - if (s->nodes == NULL) { - s->stepid = 0; - _get_step_info(s); - } - - } else { - _get_step_info(s); - } -} - -static int -_attach_to_job(srun_job_t *job) -{ - int i; - reattach_tasks_request_msg_t *req = NULL; - slurm_msg_t *msg = NULL; - hostlist_t hl = NULL; - char *name = NULL; - - req = xmalloc(job->nhosts * sizeof(reattach_tasks_request_msg_t)); - msg = xmalloc(job->nhosts * sizeof(slurm_msg_t)); - - debug("Going to attach to job %u.%u", job->jobid, job->stepid); - - hl = hostlist_create(job->step_layout->node_list); - for (i = 0; i < job->nhosts; i++) { - reattach_tasks_request_msg_t *r = &req[i]; - slurm_msg_t *m = &msg[i]; - - r->job_id = job->jobid; - r->job_step_id = job->stepid; - r->num_io_port = 1; - r->io_port = (uint16_t *)xmalloc(sizeof(uint16_t)); - r->io_port[0] = job->client_io->listenport[ - i%job->client_io->num_listen]; - r->num_resp_port = 1; - r->resp_port = (uint16_t *)xmalloc(sizeof(uint16_t)); - r->resp_port[0] = ntohs(job->jaddr[i%job->njfds].sin_port); - r->cred = job->cred; - slurm_msg_t_init(m); - m->data = r; - m->msg_type = REQUEST_REATTACH_TASKS; - name = hostlist_shift(hl); - if(!name) { - error("hostlist incomplete for this job request"); - hostlist_destroy(hl); - return SLURM_ERROR; - } - if(slurm_conf_get_addr(name, &m->address) - == SLURM_ERROR) { - error("_init_task_layout: can't get addr for " - "host %s", name); - free(name); - hostlist_destroy(hl); - return SLURM_ERROR; - } - free(name); - /* memcpy(&m->address, &job->step_layout->node_addr[i], */ -/* sizeof(slurm_addr)); */ - } - hostlist_destroy(hl); - _p_reattach(msg, job); - - return SLURM_SUCCESS; -} - 
-static void -_p_reattach(slurm_msg_t *msg, srun_job_t *job) -{ - int i; - thd_t *thd = xmalloc(job->nhosts * sizeof(thd_t)); - - for (i = 0; i < job->nhosts; i++) { - - slurm_mutex_lock(&active_mutex); - while (active >= opt.max_threads) { - pthread_cond_wait(&active_cond, &active_mutex); - } - active++; - slurm_mutex_unlock(&active_mutex); - - thd[i].msg = &msg[i]; - thd[i].job = job; - thd[i].nodeid = i; - - slurm_attr_init(&thd[i].attr); - if (pthread_attr_setdetachstate(&thd[i].attr, - PTHREAD_CREATE_DETACHED ) < 0) - fatal("pthread_attr_setdetachstate: %m"); - - if (pthread_create( &thd[i].thread, &thd[i].attr, - _p_reattach_task, (void *) &thd[i])) { - error("pthread_create: %m"); - _p_reattach_task((void *) &thd[i]); - } - slurm_attr_destroy(&thd[i].attr); - - } - - slurm_mutex_lock(&active_mutex); - while (active > 0) - pthread_cond_wait(&active_cond, &active_mutex); - slurm_mutex_unlock(&active_mutex); - - xfree(thd); -} - -static void * -_p_reattach_task(void *arg) -{ - thd_t *t = (thd_t *) arg; - int rc = 0; - char *host = nodelist_nth_host(t->job->step_layout->node_list, - t->nodeid); - - t->state = THD_ACTIVE; - debug3("sending reattach request to %s", host); - - rc = slurm_send_only_node_msg(t->msg); - if (rc < 0) { - error("reattach: %s: %m", host); - t->state = THD_FAILED; - t->job->host_state[t->nodeid] = SRUN_HOST_REPLIED; - } else { - t->state = THD_DONE; - t->job->host_state[t->nodeid] = SRUN_HOST_UNREACHABLE; - } - free(host); - slurm_mutex_lock(&active_mutex); - active--; - pthread_cond_signal(&active_cond); - slurm_mutex_unlock(&active_mutex); - - return NULL; -} - - -int reattach() -{ - List steplist = _step_list_create(opt.attach); - srun_step_t *s = NULL; - srun_job_t *job = NULL; - slurm_step_io_fds_t fds = SLURM_STEP_IO_FDS_INITIALIZER; - - if ((steplist == NULL) || (list_count(steplist) == 0)) { - info("No job/steps in attach"); - exit(1); - } - - if (list_count(steplist) > 1) - info("Warning: attach to multiple jobs/steps not 
supported"); - s = list_peek(steplist); - - _get_attach_info(s); - - if (!opt.join) - opt.ifname = "none"; - - if ((opt.nodelist = s->nodes) == NULL) - exit(1); - - if ((opt.nprocs = s->ntasks) == 0) - exit(1); - - /* - * Indicate that nprocs has been manually set - */ - opt.nprocs_set = true; - - if (!(job = job_create_noalloc())) - exit(1); - - job->jobid = s->jobid; - job->stepid = s->stepid; - - if (job->stepid == NO_VAL) { - char *new_argv0 = NULL; - xstrfmtcat(new_argv0, "attach[%d]", job->jobid); - log_set_argv0(new_argv0); - } - - /* - * mask and handle certain signals iff we are "joining" with - * the job in question. If opt.join is off, attached srun is in - * "read-only" mode and cannot forward stdin/signals. - */ - if (opt.join) - sig_setup_sigmask(); - - if (msg_thr_create(job) < 0) { - error("Unable to create msg thread: %m"); - exit(1); - } - - srun_set_stdio_fds(job, &fds); - job->client_io = client_io_handler_create(fds, - job->step_layout->task_cnt, - job->step_layout->node_cnt, - job->cred, - opt.labelio); - if (!job->client_io - || (client_io_handler_start(job->client_io) != SLURM_SUCCESS)) - job_fatal(job, "failed to start IO handler"); - - if (opt.join && sig_thr_create(job) < 0) { - error("Unable to create signals thread: %m"); - } - - _attach_to_job(job); - - if (invalid_user) - exit(1); - - slurm_mutex_lock(&job->state_mutex); - while (job->state < SRUN_JOB_TERMINATED) { - pthread_cond_wait(&job->state_cond, &job->state_mutex); - } - slurm_mutex_unlock(&job->state_mutex); - - if (job->state == SRUN_JOB_FAILED) - info("Job terminated abnormally."); - - /* - * Signal the IO thread to shutdown, which will stop - * the listening socket and file read (stdin) event - * IO objects, but allow file write (stdout) objects to - * complete any writing that remains. 
- */ - debug("Waiting for IO thread"); - if (client_io_handler_finish(job->client_io) != SLURM_SUCCESS) - error ("IO handler did not finish correctly (reattach): %m"); - client_io_handler_destroy(job->client_io); - - /* kill msg server thread */ - pthread_kill(job->jtid, SIGHUP); - - /* _complete_job(job); */ - - exit(0); -} diff --git a/src/srun/reattach.h b/src/srun/reattach.h deleted file mode 100644 index 04882a3c0f1a2b27f663e533611f41b85aa3d5d2..0000000000000000000000000000000000000000 --- a/src/srun/reattach.h +++ /dev/null @@ -1,46 +0,0 @@ -/*****************************************************************************\ - * src/srun/reattach.h support for re/attach to running jobs in slurm - ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Mark Grondona <mgrondona@llnl.gov>. - * UCRL-CODE-226842. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. 
If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -\*****************************************************************************/ - -#ifndef _REATTACH_H -#define _REATTACH_H - -/* reattach to running job, if possible. - * jobid/stepid to attach to are held in srun options "opt" - */ -int reattach(void); - -#endif /* !_REATTACH_H */ diff --git a/src/srun/signals.c b/src/srun/signals.c index 3a40abbf77cc70b37c21f5d8b198756e337c6db0..df308c1e82f4c003f6dd36f87aa9de05501e720f 100644 --- a/src/srun/signals.c +++ b/src/srun/signals.c @@ -143,8 +143,7 @@ _handle_intr(srun_job_t *job, time_t *last_intr, time_t *last_intr_sent) if (((time(NULL) - *last_intr) > 1) && !opt.disable_status) { info("interrupt (one more within 1 sec to abort)"); - if (mode != MODE_ATTACH) - report_task_status(job); + report_task_status(job); *last_intr = time(NULL); } else { /* second Ctrl-C in half as many seconds */ update_job_state(job, SRUN_JOB_CANCELLED); diff --git a/src/srun/srun.c b/src/srun/srun.c index c112e0fcd494cf96ec354a44841f12960ce5fbc0..e414bd1fa9825e6ee1548b392c2f8d9a663e9ef7 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -87,8 +87,7 @@ #include "src/srun/msg.h" #include "src/srun/opt.h" #include "src/srun/sigstr.h" -#include "src/srun/reattach.h" -#include "src/srun/attach.h" +#include "src/srun/debugger.h" #include "src/srun/srun.h" #include "src/srun/signals.h" @@ -105,21 +104,12 
@@ mpi_plugin_client_info_t mpi_job_info[1]; * forward declaration of static funcs */ static void _print_job_information(resource_allocation_response_msg_t *resp); -static char *_build_script (const char *argv0, char *pathname, int file_type); -static char *_get_shell (void); -static void _send_options(const int argc, char **argv); -static void _get_options (const char *buffer); -static char *_get_token(char *buf_ptr); -static int _is_file_text (char *, char**); -static int _run_batch_job (const char *argv0); -static int _run_job_script(srun_job_t *job, env_t *env); static void _set_prio_process_env(void); static int _set_rlimit_env(void); static int _set_umask_env(void); static char *_uint16_array_to_str(int count, const uint16_t *array); static void _switch_standalone(srun_job_t *job); static int _become_user (void); -static int _print_script_exit_status(const char *argv0, int status); static void _run_srun_prolog (srun_job_t *job); static void _run_srun_epilog (srun_job_t *job); static int _run_srun_script (srun_job_t *job, char *script); @@ -184,11 +174,10 @@ int srun(int ac, char **av) log_alter(logopt, 0, NULL); } - if (!opt.allocate) { - (void) _set_rlimit_env(); - _set_prio_process_env(); - (void) _set_umask_env(); - } + (void) _set_rlimit_env(); + _set_prio_process_env(); + (void) _set_umask_env(); + /* Set up slurmctld message handler */ slurmctld_msg_init(); @@ -204,69 +193,18 @@ int srun(int ac, char **av) info("allocation success"); exit (0); - } else if (opt.batch) { - /* allow binding with batch submissions */ - env->distribution = opt.distribution; - env->cpu_bind_type = opt.cpu_bind_type; - env->cpu_bind = opt.cpu_bind; - env->mem_bind_type = opt.mem_bind_type; - env->mem_bind = opt.mem_bind; - setup_env(env); - - if (_run_batch_job(av[0]) < 0) - exit (1); - exit (0); - } else if (opt.no_alloc) { info("do not allocate resources"); sig_setup_sigmask(); job = job_create_noalloc(); _switch_standalone(job); - } else if (opt.allocate) { - 
sig_setup_sigmask(); - if ( !(resp = allocate_nodes()) ) - exit(1); - if (opt.noshell) { - fprintf (stdout, "SLURM_JOBID=%u\n", resp->job_id); - exit (0); - } - if (_become_user () < 0) - info ("Warning: unable to assume uid=%lu\n", opt.uid); - _print_job_information(resp); - - job = job_create_allocation(resp); - if(!job) - exit(1); - - job->step_layout = - fake_slurm_step_layout_create(resp->node_list, - resp->cpus_per_node, - resp->cpu_count_reps, - resp->node_cnt, 0); - if(!job->step_layout) - exit(1); - if (msg_thr_create(job) < 0) - job_fatal(job, "Unable to create msg thread"); - exitcode = _run_job_script(job, env); - srun_job_destroy(job,exitcode); - - debug ("Spawned srun shell terminated"); - xfree(env->task_count); - xfree(env); - exit (exitcode); - } else if ((resp = existing_allocation())) { job_id = resp->job_id; if (opt.alloc_nodelist == NULL) opt.alloc_nodelist = xstrdup(resp->node_list); slurm_free_resource_allocation_response_msg(resp); - if (opt.allocate) { - error("job %u already has an allocation", - job_id); - exit(1); - } job = job_step_create_allocation(job_id); @@ -278,9 +216,6 @@ int srun(int ac, char **av) if (create_job_step(job) < 0) exit(1); - } else if (mode == MODE_ATTACH) { - reattach(); - exit (0); } else { /* Combined job allocation and job step launch */ #ifdef HAVE_FRONT_END @@ -422,10 +357,11 @@ int srun(int ac, char **av) * We want to make sure we get the correct state of the job * and not finish before all the messages have been sent. 
*/ - if (job->state == SRUN_JOB_FAILED) - close(job->forked_msg->msg_par->msg_pipe[1]); +/* FIXME - need a new way to tell the message thread to shutdown */ +/* if (job->state == SRUN_JOB_FAILED) */ +/* close(job->forked_msg->msg_par->msg_pipe[1]); */ debug("Waiting for message thread"); - if (pthread_join(job->jtid, NULL) < 0) + if (pthread_join(job->msg_tid, NULL) < 0) error ("Waiting on message thread: %m"); debug("done"); @@ -479,8 +415,8 @@ static int _call_spank_local_user (srun_job_t *job) info->jobid = job->jobid; info->stepid = job->stepid; info->step_layout = job->step_layout; - info->argc = remote_argc; - info->argv = remote_argv; + info->argc = opt.argc; + info->argv = opt.argv; return spank_local_user(info); } @@ -582,301 +518,6 @@ _print_job_information(resource_allocation_response_msg_t *resp) verbose("%s",job_details); } - -/* submit a batch job and return error code */ -static int -_run_batch_job(const char *argv0) -{ - int file_type, retries; - int rc = SLURM_SUCCESS; - job_desc_msg_t *req; - submit_response_msg_t *resp; - char *script; - void (*log_msg) (const char *fmt, ...) 
= (void (*)) &error; - - if ((remote_argc == 0) || (remote_argv[0] == NULL)) - return SLURM_ERROR; - - file_type = _is_file_text (remote_argv[0], NULL); - - /* if (file_type == TYPE_NOT_TEXT) { - * error ("file %s is not script", remote_argv[0]); - * return SLURM_ERROR; - * } - */ - - if ((script = _build_script (argv0, remote_argv[0], file_type)) - == NULL) { - error ("unable to build script from file %s", remote_argv[0]); - return SLURM_ERROR; - } - - if (!(req = job_desc_msg_create_from_opts (script))) - fatal ("Unable to create job request"); - - /* Do not re-use existing job id from environment variable - * when submitting new job from within a running job */ - if (!opt.jobid_set) - req->job_id = NO_VAL; - - retries = 0; - while ( (retries < MAX_RETRIES) - && (rc = slurm_submit_batch_job(req, &resp)) < 0) { - - if (errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) - return (error("Unable to submit batch job: %m")); - - (*log_msg) ("Controller not responding, retrying..."); - log_msg = &debug; - sleep (++retries); - } - - - if (rc == SLURM_SUCCESS) { - if (resp->step_id == NO_VAL) - info ("jobid %u submitted",resp->job_id); - else - info ("jobid %u.%u submitted",resp->job_id, - resp->step_id); - if (resp->error_code) { - if (opt.immediate) { - error("Job failed: %s", - slurm_strerror(resp->error_code)); - rc = resp->error_code; - } else { - info("Warning: %s", - slurm_strerror(resp->error_code)); - } - } - slurm_free_submit_response_response_msg (resp); - } - - job_desc_msg_destroy (req); - xfree (script); - - return (rc); -} - -static void _send_options(const int argc, char **argv) -{ - int i; - - set_options(argc, argv, 0); - for(i=1; i<argc; i++) { - debug3("argv[%d] = %s.",i,argv[i]); - xfree(argv[i]); - } -} - -/* _get_shell - return a string containing the default shell for this user - * NOTE: This function is NOT reentrant (see getpwuid_r if needed) */ -static char * -_get_shell (void) -{ - struct passwd *pw_ent_ptr; - - pw_ent_ptr = getpwuid (opt.uid); - if 
( ! pw_ent_ptr ) { - pw_ent_ptr = getpwnam( "nobody" ); - info( "warning - no user information for user %d", opt.uid ); - } - return pw_ent_ptr->pw_shell; -} - -static char *_get_token(char *buf_ptr) -{ - int i, token_size = 0; - char *token; - - for (i=1; (buf_ptr[i] != '\n') && (buf_ptr[i] != '\0'); - i++) { - if (isspace(buf_ptr[i])) - break; - } - token_size = i; - - token = xmalloc(token_size + 1); - strncpy(token, buf_ptr, token_size); - return token; -} - -/* _get_opts - gather options put in user script. Used for batch scripts. */ -static void -_get_options (const char *buffer) -{ - int argc = 1; - char *argv[MAX_ENTRIES]; - char *buf_loc = (char *) buffer; - - while ((buf_loc = strstr(buf_loc, "#SLURM"))) { - buf_loc += 6; - /* find the tokens and move them to argv */ - for ( ; ((buf_loc[0] != '\n') && (buf_loc[0] != '\0')); - buf_loc++) { - if (isspace(buf_loc[0])) - continue; - argv[argc] = _get_token(buf_loc); - buf_loc += (strlen(argv[argc]) - 1); - argc++; - } - } - if(argc > 1) - _send_options(argc, argv); - return; -} - -#define F 0 /* char never appears in text */ -#define T 1 /* character appears in plain ASCII text */ -#define I 2 /* character appears in ISO-8859 text */ -#define X 3 /* character appears in non-ISO extended ASCII */ -static char text_chars[256] = { - /* BEL BS HT LF FF CR */ - F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ - /* ESC */ - F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ - T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ - T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ - T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ - T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ - T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ - T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ - /* NEL */ - X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ - X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ - I, I, I, I, I, I, I, I, I, I, 
I, I, I, I, I, I, /* 0xaX */ - I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ - I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ - I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ - I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ - I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ -}; - - -/* _is_file_text - determine if specified file is a script - * shell_ptr - if not NULL, set to pointer to pathname of specified shell - * (if any, ie. return code of 2) - * return 0 if the specified file can not be read or does not contain text - * returns 2 if file contains text starting with "#!", otherwise - * returns 1 if file contains text, but lacks "#!" header - */ -static int -_is_file_text (char *fname, char **shell_ptr) -{ - int buf_size, fd, i; - int rc = 1; /* initially assume the file contains text */ - unsigned char buffer[8192]; - - if (fname[0] != '/') { - info("warning: %s not found in local path", fname); - return 0; - } - - fd = open(fname, O_RDONLY); - if (fd < 0) { - error ("Unable to open file %s: %m", fname); - return 0; - } - - buf_size = read (fd, buffer, sizeof (buffer)); - if (buf_size < 0) { - error ("Unable to read file %s: %m", fname); - rc = 0; - } - (void) close (fd); - - for (i=0; i<buf_size; i++) { - if (((int) text_chars[buffer[i]] != T) - && ((int) text_chars[buffer[i]] != I)) { - rc = 0; - break; - } - } - - if ((rc == 1) && (buf_size > 2)) { - if ((buffer[0] == '#') && (buffer[1] == '!')) - rc = 2; - } - - if ((rc == 2) && shell_ptr) { - shell_ptr[0] = xmalloc (sizeof (buffer)); - for (i=2; i<sizeof(buffer); i++) { - if (iscntrl (buffer[i])) { - shell_ptr[0][i-2] = '\0'; - break; - } else - shell_ptr[0][i-2] = buffer[i]; - } - if (i == sizeof(buffer)) { - error ("shell specified in script too long, not used"); - xfree (shell_ptr[0]); - shell_ptr[0] = NULL; - } - } - - return rc; -} - -/* allocate and build a string containing a script for a batch job */ -static char * -_build_script (const 
char *argv0, char *fname, int file_type) -{ - cbuf_t cb = cbuf_create(512, 1048576); - int fd = -1; - int i = 0; - char *buffer = NULL; - - if (file_type != 0) { - if ((fd = open(fname, O_RDONLY)) < 0) { - error ("Unable to open file %s: %m", fname); - return NULL; - } - } - - if (file_type != TYPE_SCRIPT) { - xstrfmtcat(buffer, "#!%s\n", _get_shell()); - if (file_type == 0) { - xstrfmtcat(buffer, "%s ", argv0); /* path to srun */ - for (i = 0; i < remote_argc; i++) - xstrfmtcat(buffer, "%s ", remote_argv[i]); - xstrcatchar(buffer, '\n'); - } - } - - if (file_type != 0) { - int len = buffer ? strlen(buffer) : 0; - int size; - - while ((size = cbuf_write_from_fd(cb, fd, -1, NULL)) > 0) - ; - - if (size < 0) { - error ("unable to read %s: %m", fname); - cbuf_destroy(cb); - return NULL; - } - - cbuf_write(cb, "\0", 1, NULL); - - xrealloc(buffer, cbuf_used(cb) + len +1); - - cbuf_read(cb, buffer+len, cbuf_used(cb)); - - if (close(fd) < 0) - error("close: %m"); - } - - cbuf_destroy(cb); - - _get_options(buffer); - - if (strlen(buffer) >= 0xffff) { - error("Job script exceeds size supported by slurm"); - xfree(buffer); - } - - return buffer; -} - /* Set SLURM_UMASK environment variable with current state */ static int _set_umask_env(void) { @@ -1002,111 +643,6 @@ static int _set_rlimit_env(void) return rc; } -static int -_print_script_exit_status(const char *argv0, int status) -{ - char *corestr = ""; - int exitcode = 0; - - if (status == 0) { - verbose("%s: Done", argv0); - return exitcode; - } - -#ifdef WCOREDUMP - if (WCOREDUMP(status)) - corestr = " (core dumped)"; -#endif - - if (WIFSIGNALED(status)) { - error("%s: %s%s", argv0, sigstr(status), corestr); - return WTERMSIG(status) + 128; - } - if (WEXITSTATUS(status)) - error("%s: Exit %d", argv0, WEXITSTATUS(status)); - return WEXITSTATUS(status); -} - -/* allocation option specified, spawn a script and wait for it to exit */ -static int _run_job_script (srun_job_t *job, env_t *env) -{ - int status, exitcode; - 
pid_t cpid; - char **argv = (remote_argv[0] ? remote_argv : NULL); - - if (opt.nprocs_set) - env->nprocs = opt.nprocs; - if (opt.cpus_set) - env->cpus_per_task = opt.cpus_per_task; - if (opt.ntasks_per_node != NO_VAL) - env->ntasks_per_node = opt.ntasks_per_node; - if (opt.ntasks_per_socket != NO_VAL) - env->ntasks_per_socket = opt.ntasks_per_socket; - if (opt.ntasks_per_core != NO_VAL) - env->ntasks_per_core = opt.ntasks_per_core; - env->distribution = opt.distribution; - env->overcommit = opt.overcommit; - env->slurmd_debug = opt.slurmd_debug; - env->labelio = opt.labelio; - env->comm_port = slurmctld_comm_addr.port; - env->comm_hostname = slurmctld_comm_addr.hostname; - if(job) { - env->select_jobinfo = job->select_jobinfo; - env->jobid = job->jobid; - env->nhosts = job->nhosts; - env->nodelist = job->nodelist; - env->task_count = _uint16_array_to_str( - job->nhosts, job->step_layout->tasks); - } - - if (setup_env(env) != SLURM_SUCCESS) - return SLURM_ERROR; - - if (!argv) { - /* - * If no arguments were supplied, spawn a shell - * for the user. - */ - argv = xmalloc(2 * sizeof(char *)); - argv[0] = _get_shell(); - argv[1] = NULL; - } - - if ((cpid = fork()) < 0) { - error("fork: %m"); - exit(1); - } - - if (cpid == 0) { - /* - * Child. - */ -#ifdef HAVE_AIX - (void) mkcrid(0); -#endif - log_fini(); - sig_unblock_signals(); - execvp(argv[0], argv); - exit(1); - } - - /* - * Parent continues. 
- */ - - again: - if (waitpid(cpid, &status, 0) < (pid_t) 0) { - if (errno == EINTR) - goto again; - error("waitpid: %m"); - } - - exitcode = _print_script_exit_status(xbasename(argv[0]), status); - - (void) unsetenv("SLURM_JOBID"); /* no return code on some systems */ - return exitcode; -} - static int _become_user (void) { struct passwd *pwd = getpwuid (opt.uid); @@ -1171,8 +707,8 @@ static int _run_srun_script (srun_job_t *job, char *script) */ args = xmalloc(sizeof(char *) * 1024); args[0] = script; - for (i = 0; i < remote_argc; i++) { - args[i+1] = remote_argv[i]; + for (i = 0; i < opt.argc; i++) { + args[i+1] = opt.argv[i]; } args[i+1] = NULL; execv(script, args); diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c index 1d28c43346fc5532790a9bc7e1b231e596c6ffb8..a286902229c40e2d6668641c800ff77a74d8aa03 100644 --- a/src/srun/srun_job.c +++ b/src/srun/srun_job.c @@ -65,7 +65,7 @@ #include "src/srun/srun_job.h" #include "src/srun/opt.h" #include "src/srun/fname.h" -#include "src/srun/attach.h" +#include "src/srun/debugger.h" #include "src/srun/msg.h" typedef enum {DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED} state_t; @@ -507,26 +507,14 @@ _job_create_structure(allocation_info_t *ainfo) void update_job_state(srun_job_t *job, srun_job_state_t state) { - pipe_enum_t pipe_enum = PIPE_JOB_STATE; pthread_mutex_lock(&job->state_mutex); if (job->state < state) { job->state = state; - if(message_thread) { - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - safe_write(job->forked_msg->par_msg->msg_pipe[1], - &job->state, sizeof(int)); - } pthread_cond_signal(&job->state_cond); } pthread_mutex_unlock(&job->state_mutex); return; -rwfail: - pthread_mutex_unlock(&job->state_mutex); - error("update_job_state: " - "write from srun message-handler process failed"); - } srun_job_state_t @@ -543,13 +531,8 @@ job_state(srun_job_t *job) void job_force_termination(srun_job_t *job) { - if (mode == MODE_ATTACH) { - info ("forcing detach"); - 
update_job_state(job, SRUN_JOB_DETACHED); - } else { - info ("forcing job termination"); - update_job_state(job, SRUN_JOB_FORCETERM); - } + info ("forcing job termination"); + update_job_state(job, SRUN_JOB_FORCETERM); client_io_handler_finish(job->client_io); } @@ -677,7 +660,6 @@ fwd_signal(srun_job_t *job, int signo, int max_threads) slurm_msg_t req; kill_tasks_msg_t msg; static pthread_mutex_t sig_mutex = PTHREAD_MUTEX_INITIALIZER; - pipe_enum_t pipe_enum = PIPE_SIGNALED; hostlist_t hl; char *name = NULL; char buf[8192]; @@ -692,12 +674,6 @@ fwd_signal(srun_job_t *job, int signo, int max_threads) slurm_mutex_lock(&job->state_mutex); job->signaled = true; slurm_mutex_unlock(&job->state_mutex); - if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->signaled,sizeof(int)); - } } debug2("forward signal %d to job", signo); diff --git a/src/srun/srun_job.h b/src/srun/srun_job.h index d2afa64ffce0c95083a1a6ebd0728a7051dfae1e..5b677101667d0e088a744c793f8e383d98a4bb0a 100644 --- a/src/srun/srun_job.h +++ b/src/srun/srun_job.h @@ -86,31 +86,6 @@ typedef enum { SRUN_TASK_ABNORMAL_EXIT } srun_task_state_t; -typedef enum { - PIPE_NONE = 0, - PIPE_JOB_STATE, - PIPE_TASK_STATE, - PIPE_TASK_EXITCODE, - PIPE_HOST_STATE, - PIPE_SIGNALED, - PIPE_MPIR_DEBUG_STATE, - PIPE_UPDATE_MPIR_PROCTABLE, - PIPE_UPDATE_STEP_LAYOUT, - PIPE_NODE_FAIL -} pipe_enum_t; - -/* For Message thread */ -typedef struct forked_msg_pipe { - int msg_pipe[2]; - int pid; -} forked_msg_pipe_t; - -typedef struct forked_message { - forked_msg_pipe_t * par_msg; - forked_msg_pipe_t * msg_par; - enum job_states * job_state; -} forked_msg_t; - typedef struct io_filename io_filename_t; typedef struct srun_job { @@ -135,7 +110,7 @@ typedef struct srun_job { pthread_t sigid; /* signals thread tid */ - pthread_t jtid; /* job control thread id */ + pthread_t msg_tid; /* message thread id */ slurm_fd *jfd; /* job control info 
fd */ pthread_t lid; /* launch thread id */ @@ -153,7 +128,6 @@ typedef struct srun_job { io_filename_t *ifname; io_filename_t *ofname; io_filename_t *efname; - forked_msg_t *forked_msg; char *task_epilog; /* task-epilog */ char *task_prolog; /* task-prolog */ pthread_mutex_t task_mutex; diff --git a/testsuite/expect/README b/testsuite/expect/README index e9540b6975f1171f81b93efda13cbd30f7762a07..b4cf7e5d3a8836eba5e3cf7187d29bf657abdd34 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -68,18 +68,14 @@ test1.8 Confirm that basic srun stdin, stdout, and stderr options work (--input, --output, and --error option respectively). test1.9 Test of srun verbose mode (--verbose option). test1.10 Test of srun/slurmd debug mode (--debug option). -test1.11 Test of batch job and job name options (--batch and --job-name - options). -test1.12 Test of processors, memory, and temporary disk space - constraints options (--mincpus, --mem, and --tmp options). - Also test that priority zero job is not started (--hold option). +test1.11 Test job name option (--job-name). +test1.12 REMOVED test1.13 Test of immediate allocation option (--immediate option). -test1.14 Test of shared and contiguous options (--shared and --contiguous). - Also uses --batch and --hold options. Also see test1.53. +test1.14 REMOVED test1.15 Test of wait option (--wait option). test1.16 Confirm that srun buffering can be disabled (--unbuffered option). -test1.17 Confirm that srun allocation mode (--allocate option). -test1.18 Test of srun attach to existing job (--attach and --join options). +test1.17 REMOVED +test1.18 REMOVED test1.19 Test srun stdout/err file name formatting (--output and --error options with %j, %J, %n, %s and %t specifications). test1.20 Test srun stdout/err disabling (--output and --error options with @@ -104,31 +100,28 @@ test1.31 Verify that SLURM directed environment variables are processed: SLURM_STDOUTMODE. 
test1.32 Test of srun signal forwarding test1.33 Test of srun application exit code reporting -test1.34 Verify that command arguments get forwarded to job script - (--batch option). +test1.34 REMOVED test1.35 Test of batch job with multiple concurrent job steps test1.36 Test parallel launch of srun (e.g. "srun srun hostname") -test1.37 Confirm that node sharing flags are respected (--nodelist and - --share options). +test1.37 REMOVED test1.38 Test srun handling of SIGINT to get task status or kill the job (--quit-on-interrupt option). test1.39 Test of linux light-weight core files. -test1.40 Test of stand-alone srun resource allocation (--uid and --no-shell - options). +test1.40 REMOVED test1.41 Validate SLURM debugger infrastructure (--debugger-test option). -test1.42 Test of account number and job dependencies (--account, --begin +test1.42 Test of account number and job dependencies (--account, and --depedency options). test1.43 Test of slurm_job_will_run API, (srun --test-only option). test1.44 Read srun's stdout slowly and test for lost data. -test1.45 Test the launch of a batch job within an existing job allocation. +test1.45 REMOVED test1.46 Test srun option --kill-on-bad-exit -test1.47 Tests #SLURM entry functionality in a batch script. +test1.47 REMOVED test1.48 Test of srun mail options (--mail-type and --mail-user options). test1.49 Test of srun task-prolog and task-epilog options. test1.50 Test of running non-existant job, confirm timely termination. test1.51 Test propagation of umask to spawned tasks. test1.52 Test of hostfile logic -test1.53 Test of nice value specification (--nice option). +test1.53 REMOVED test1.54 Test of running different executables with different arguments for each task (--multi-prog option). test1.55 Make certain that srun behaves when its controlling terminal @@ -151,8 +144,7 @@ test1.83 Test of contiguous option with multiple nodes (--contiguous option). Also see test1.14. 
test1.84 Test of cpus-per-task option on a single node (--cpus-per-task option). -test1.85 Test of partition specification on job submission (--partition - option). +test1.85 REMOVE test1.86 Confirm node selection from within a job step on existing allocation (--nodelist, --exclude, --nodes and --nprocs options). test1.87 Confirm node selection from within a job step on existing allocation diff --git a/testsuite/expect/test1.11 b/testsuite/expect/test1.11 index e4844c231fa71a80ced024ceed9ae9513c1a11dc..75c5e4a3be7af3b5755acf006139457332d2f13c 100755 --- a/testsuite/expect/test1.11 +++ b/testsuite/expect/test1.11 @@ -36,10 +36,6 @@ source ./globals set test_id "1.11" -set file_in "test$test_id.input" -set file_out "test$test_id.output" -set file_err "test$test_id.error" -set job_name "jobname$test_id" set name_read "" set complete_flag 0 @@ -53,125 +49,6 @@ set got_login_grps 0 print_header $test_id -# -# Delete left-over input script plus stdout/err files -# Build input script file that runs two job steps -# -exec $bin_rm -f $file_in $file_out $file_err -make_bash_script $file_in " - $bin_id - $srun $bin_sleep 1 - $srun $bin_sleep 1 -" - -# -# Spawn a srun batch job that uses stdout/err and confirm their contents -# -set timeout $max_job_delay -set srun_pid [spawn $srun --batch --output=$file_out --error=$file_err --job-name=$job_name -t1 $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} - -if {$job_id == 0} { - send_user "\nFAILURE: batch submit failure\n" - exit 1 -} - -# -# Wait for job to complete -# -if {[wait_for_job $job_id "DONE"] != 0} { - send_user "\nFAILURE: waiting for job to complete\n" - set exit_code 1 -} - -spawn $scontrol show job $job_id -expect { - -re "Name=$job_name" { - set name_flag 1 - exp_continue - } - -re "JobState=COMPLETE" { - set complete_flag 1 - 
exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } -} - -if {$name_flag == 0} { - send_user "\nFAILURE: batch job name failure\n" - set exit_code 1 -} - -if {$complete_flag == 0} { - send_user "\nFAILURE: batch job termination failure\n" - set exit_code 1 -} - -# -# Check user id and group id in stdout -# -spawn $bin_id -expect { - -re "(uid=.*\n)" { - set login_grp_info $expect_out(1,string) - set got_login_grps 1 - exp_continue - } - eof { - wait - } -} - -if {[wait_for_file $file_out] == 0} { - spawn $bin_cat $file_out - expect { - -re "(uid=.*\n)" { - set job_grp_info $expect_out(1,string) - set got_job_grps 1 - exp_continue - } - eof { - wait - } - } -} - -if {$got_login_grps == 0} { - send_user "\nFAILURE: Unable to get user and group ID info\n" - set exit_code 1 -} -if {$got_job_grps == 0} { - send_user "\nFAILURE: User and group ID info missing from stdout\n" - set exit_code 1 -} -if {[string compare $login_grp_info $job_grp_info] != 0} { - send_user "\nFAILURE: Login and slurm user info mismatch\n" - set exit_code 1 -} - -if {$exit_code == 0} { - exec $bin_rm -f $file_in $file_out $file_err -} - # # Spawn a srun job with a really long name and confirm it is accepted or truncated # diff --git a/testsuite/expect/test1.12 b/testsuite/expect/test1.12 deleted file mode 100755 index 2d64951bca493af9b92e1d8e5465c8adb376bfea..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.12 +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Test of processors, memory, and temporary disk space -# constraints options (--mincpus, --mem, and --tmp options). -# Also test that priority zero job is not started (--hold -# option). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." 
otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2002-2006 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.12" -set exit_code 0 -set file_in "test$test_id.input" -set job_id 0 - -set cpu_cnt 1 -set mem_size 13 -set tmp_size 2 -set matches 0 - -print_header $test_id - -# -# Delete left-over input script -# Build input script file -# -exec $bin_rm -f $file_in -make_bash_script $file_in " - $bin_sleep 10 -" - -# -# Spawn a srun batch job with constraints and stdout/err -# -set srun_pid [spawn $srun --batch --output=none --error=none --mincpus=$cpu_cnt --mem=$mem_size --tmp=$tmp_size --hold -t1 $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} - -if {$job_id == 0} { - send_user "\nFAILURE: batch submit failure\n" - exit 1 -} - -# -# Confirm constraints are registered and wait for job completion -# -spawn $scontrol show job $job_id -expect { - -re "Priority=($number)" { - set read_prio $expect_out(1,string) - if {$read_prio == 0} { - incr matches - send_user "match of Priority\n" - } - exp_continue - } - -re "JobState=PENDING" { - incr matches - send_user "match of JobState\n" - exp_continue - } - -re "MinProcs=($number)" { - set read_proc $expect_out(1,string) - if {$read_proc == $cpu_cnt} { - incr matches - send_user "match of MinProcs\n" - } - exp_continue - } - -re "MinMemory=($number)" { - set read_mem $expect_out(1,string) - if {$read_mem == $mem_size} { - incr matches - send_user "match of MinMemory\n" - } - exp_continue - } - -re "MinTmpDisk=($number)" { - set read_disk $expect_out(1,string) - if {$read_disk == $tmp_size} { - incr matches - send_user "match of MinTmpDisk\n" - } - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } -} -cancel_job $job_id - -if {$matches != 5} { - send_user "\nFAILURE: Did 
not get proper constraints\n" - set exit_code 1 -} -if {$exit_code == 0} { - exec $bin_rm -f $file_in - send_user "\nSUCCESS\n" -} -exit $exit_code diff --git a/testsuite/expect/test1.14 b/testsuite/expect/test1.14 deleted file mode 100755 index 114ef40c7f296c09be540050131a40ffcc7ba0a1..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.14 +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Test of shared and contiguous options (--shared and --contiguous). -# Also uses --batch and --hold options. Also see test1.53. -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2002-2006 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.14" -set exit_code 0 -set job_id 0 -set matches 0 - -print_header $test_id - -# -# Spawn a srun batch job with shared option only -# -set srun_pid [spawn $srun --share --hold --batch -t1 $bin_pwd] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} - -# -# Confirm shared and contiguous flag values -# -if {$job_id != 0} { - spawn $scontrol show job $job_id - expect { - -re "Shared=($number)" { - set shared_val $expect_out(1,string) - if {$shared_val == 1} { - incr matches - } - exp_continue - } - -re "Contiguous=($number)" { - set cont_val $expect_out(1,string) - if {$cont_val == 0} { - incr matches - } - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } - } - cancel_job $job_id - set job_id 0 -} else { - set exit_code 1 -} - -# -# Spawn a srun batch job with contiguous option only -# -set job_id 0 -spawn $srun --contiguous --hold --batch -t1 $bin_pwd -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - exit 1 - } - eof { - wait - } -} - -# -# Confirm shared and contiguous flag values -# -if {$job_id != 0} { - spawn $scontrol show job $job_id - expect { - -re "Shared=($alpha)" { - set shared_val $expect_out(1,string) - if {[string compare $shared_val OK] == 0} { - incr matches - } - exp_continue - } - -re "Contiguous=($number)" { - set cont_val $expect_out(1,string) - if {$cont_val == 1} { - incr matches - } - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } - } - cancel_job $job_id -} else { - set exit_code 1 -} - -if {$matches != 4} { - 
send_user "\nFAILURE: Did not properly set shared and contiguous flags\n" - set exit_code 1 -} -if {$exit_code == 0} { - send_user "\nSUCCESS\n" -} -exit $exit_code diff --git a/testsuite/expect/test1.17 b/testsuite/expect/test1.17 deleted file mode 100755 index c92cbce48193ab4aacaa8382aa7fad6107b8e9c5..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.17 +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Confirm that srun allocation mode (--allocate option). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2002 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.17" -set exit_code 0 -set job_id 0 -set slurm_jobid 0 - -print_header $test_id - -# -# Submit a slurm allocate job -# Interactively print $SLURM_JOBID -# -set timeout $max_job_delay -set match 0 -set srun_pid [spawn $srun --allocate --verbose -t1] -expect { - -re "jobid ($number).*" { - set job_id $expect_out(1,string) - send "$bin_echo MY_ID=\$SLURM_JOBID \n" - exp_continue - } - -re "MY_ID=($number)" { - set slurm_jobid $expect_out(1,string) - send "exit 2\n" - exp_continue - } - -re "MY_ID=" { -# no environment variable - send "exit 2\n" - exp_continue - } - -re "\[Ee\]xit 2" { - send_user "This error was expected, no worries\n" - set match 1 - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} - - -# -# Confirm the job_ids match. -# -if { $job_id == 0 } { - send_user "\nFAILURE: srun --allocate failure\n" - set exit_code 1 -} else { - if { $job_id != $slurm_jobid } { - send_user "\nFAILURE: srun job_id mis-match\n" - set exit_code 1 - } -} - -# -# Confirm exit code is propogated -# -if { $match != 1 } { - send_user "\nFAILURE: srun exit code not reported\n" - set exit_code 1 -} - -if { $exit_code == 0 } { - send_user "\nSUCCESS\n" -} -exit $exit_code diff --git a/testsuite/expect/test1.18 b/testsuite/expect/test1.18 deleted file mode 100755 index 5419a058e4879d148cc71bf0793591228f9fadbe..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.18 +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Test of srun attach to existing job (--attach and --join options). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." 
otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2002-2006 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.18" -set exit_code 0 -set file_prog "test$test_id.prog" -set job_id 0 -set matches 0 -set tasks 8 - -print_header $test_id -if { [test_bluegene] } { - set tasks 1 - set node_cnt 1-512 -} else { - set tasks 8 - set node_cnt 1-$tasks -} - -# -# Delete left-over program and rebuild it -# -exec $bin_rm -f $file_prog -exec $bin_make -f /dev/null $file_prog -exec $bin_chmod 700 $file_prog - -# -# Spawn initial program via srun -# -set timeout $max_job_delay -set srun_pid [spawn $srun -N$node_cnt -n$tasks -O -v -t5 -l $file_prog] -set init_id $spawn_id -expect { - -i $init_id - -re "launching ($number).0 on host" { - set job_id $expect_out(1,string) - exp_continue - } - -re "WAITING" { - incr matches - if {$matches < $tasks} { - exp_continue - } - } - timeout { - send_user "\nFAILURE: srun (launch) not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} -if {$job_id == 0} { - send_user "\nFAILURE: job submit failure\n" - exit 1 -} -if {$matches == 0} { - send_user "\nFAILURE: job run time failure\n" - exit 1 -} - -# -# Attach to initial program via srun -# -set matches 0 -set timeout 10 -set attach_pid [spawn $srun -vv -l --attach=$job_id --join] -set attach_id $spawn_id -expect { - -i $attach_id - -re "WAITING" { - incr matches - send_user "\nsending exit message\n" - send -i $attach_id "exit\n" - } - timeout { - send_user "\nFAILURE: srun (attach) not responding\n" - slow_kill $attach_pid - set exit_code 1 - } - eof { - wait - } -} -if {$matches == 0} { - send_user "\nFAILURE: job run time failure\n" - set exit_code 1 -} - -# -# Make sure initial program terminates too -# -# Explicitly reset spawn_id for wait call -set spawn_id $init_id -expect { - timeout { - send_user "\nFAILURE: srun (terminate) not responding\n" - set exit_code 1 - } - eof { - wait - } -} - -if {$exit_code == 0} { - exec $bin_rm -f $file_prog - send_user 
"\nSUCCESS\n" -} else { - cancel_job $job_id -} -exit $exit_code diff --git a/testsuite/expect/test1.19 b/testsuite/expect/test1.19 index e34b769c438246057735fefa9b6db59a9f1bdc6b..0e8ca56336a4f030cd240460943425b761c87fa0 100755 --- a/testsuite/expect/test1.19 +++ b/testsuite/expect/test1.19 @@ -241,9 +241,9 @@ if { [test_bluegene] } { } set job_id 0 -set srun_pid [spawn $srun --batch --output=/dev/null -N$node_cnt -n$task_cnt -O -t1 $file_in] +set srun_pid [spawn $sbatch --output=/dev/null -N$node_cnt -n$task_cnt -t1 $file_in] expect { - -re "jobid ($number) submitted" { + -re "Submitted batch job ($number)" { set job_id $expect_out(1,string) exp_continue } diff --git a/testsuite/expect/test1.29 b/testsuite/expect/test1.29 index f8d1878f099b63f08ac941099d2e4762d2997195..680fe61cdf36ed93f9187f151b0766f8d629f8e2 100755 --- a/testsuite/expect/test1.29 +++ b/testsuite/expect/test1.29 @@ -133,9 +133,9 @@ make_bash_script $file_in " set timeout $max_job_delay -set srun_pid [spawn $srun --batch --output=$file_out --error=$file_err -t1 ./$file_in] +set srun_pid [spawn $sbatch --output=$file_out --error=$file_err -t1 ./$file_in] expect { - -re "jobid ($number) submitted" { + -re "Submitted batch job ($number)" { set job_id $expect_out(1,string) exp_continue } diff --git a/testsuite/expect/test1.34 b/testsuite/expect/test1.34 deleted file mode 100755 index b2e6bb76f6c000dd6493e796b526e1293b660439..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.34 +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Verify that arguments get forwarded to job script (--batch option). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. 
-# -# Note: This script generates and then deletes files in the working directory -# named test1.34.input, test1.34.output, and test1.34.error -############################################################################ -# Copyright (C) 2002 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.34" -set file_in "test$test_id.input" -set file_out "test$test_id.output" -set file_err "test$test_id.error" - -set arg1 "arg_one" -set arg2 "arg_two" -set arg_match 0 -set exit_code 0 -set job_id 0 - -print_header $test_id - -# -# Delete left-over input script plus stdout/err files -# Build input script file -# -exec $bin_rm -f $file_in $file_out $file_err -make_bash_script $file_in "$bin_echo \$1,\$2" - -# -# Spawn a srun batch job with arguments -# -set timeout $max_job_delay -set srun_pid [spawn $srun --batch --output=$file_out --error=$file_err -t1 $file_in $arg1 $arg2] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} - -if {$job_id == 0} { - send_user "\nFAILURE: batch submit failure\n" - exit 1 -} - -# -# Wait for job to complete -# -if {[wait_for_job $job_id "DONE"] != 0} { - send_user "\nFAILURE: waiting for job to complete\n" - cancel_job $job_id - set exit_code 1 -} - -# -# Check arguments returned in stdout -# -if {[wait_for_file $file_out] == 0} { - spawn $bin_cat $file_out - expect { - -re "$arg1,$arg2" { - set arg_match 1 - exp_continue - } - eof { - wait - } - } -} - -if {$arg_match == 0} { - send_user "\nFAILURE: Failed to pass script arguments\n" - set exit_code 1 -} - -if {$exit_code == 0} { - send_user "\nSUCCESS\n" - exec $bin_rm -f $file_in $file_out $file_err -} -exit $exit_code diff --git a/testsuite/expect/test1.35 b/testsuite/expect/test1.35 index b19e81a00f46696993f784d32d6205b0713700e9..a3766adb2c9600be1569c1dc43eb19590f51f3a9 100755 --- a/testsuite/expect/test1.35 +++ b/testsuite/expect/test1.35 @@ -75,9 +75,9 @@ if { [test_bluegene] } { } } -set srun_pid [spawn $srun --batch -N$node_cnt --output=$file_out --error=$file_err -t1 $file_in] +set srun_pid 
[spawn $sbatch -N$node_cnt --output=$file_out --error=$file_err -t1 $file_in] expect { - -re "jobid ($number) submitted" { + -re "Submitted batch job ($number)" { set job_id $expect_out(1,string) exp_continue } diff --git a/testsuite/expect/test1.37 b/testsuite/expect/test1.37 deleted file mode 100755 index a505b0a1a6a54e3657d9676c8773170416d84feb..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.37 +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Confirm that node sharing flags are respected (--nodelist and -# --share options). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "WARNING: ..." with an explanation of why the test can't be made, OR -# "FAILURE: ..." otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2002 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. 
-# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -############################################################################ -source ./globals - -set test_id "1.37" -set exit_code 0 -set file_err "test$test_id.error" -set file_in "test$test_id.input" -set file_out "test$test_id.output" -set job_id1 0 -set host_name "" -set nodelist_name "" - -print_header $test_id - -# -# Submit a job and get the node's NodeName from the nodelist -# -set timeout $max_job_delay -set srun_pid [spawn $srun -v -N1 -l -t1 $bin_hostname] -expect { - -re "on host ($alpha_numeric)," { - set nodelist_name $expect_out(1,string) - exp_continue - } - -re "0: ($alpha_numeric)" { - set host_name $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} -if {[string compare $nodelist_name ""] == 0} { - send_user "\nFAILURE: Did not get hostname of task 0\n" - exit 1 -} -if {[test_front_end] != 0} { - send_user "\nWARNING: Additional testing is incompatable with front-end systems\n" - exit $exit_code -} - -# -# Delete left-over input script -# Build input script file -# -exec $bin_rm -f $file_in -make_bash_script $file_in "$srun $bin_sleep 5" - -# -# Submit two jobs to the same node, one with no sharing, the other -# with sharing permitted. Insure the first job completes before the -# second job is started. 
-# -set srun_pid [spawn $srun --batch -N1 --exclusive --nodelist=$nodelist_name -t1 --output=$file_out --error=$file_err $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id1 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - exit 1 - } - eof { - wait - } -} -if {$job_id1 == 0} { - send_user "\nFAILURE: srun failed to report jobid\n" - exit 1 -} - -set partition "dummy" -set waited 1 -set timeout [expr $timeout + 5] -set srun_pid [spawn $srun -N1 --nodelist=$nodelist_name -t1 --share $scontrol -o show job $job_id1] -expect { - -re "Partition=($alpha_numeric)" { - set partition $expect_out(1,string) - exp_continue - } - -re "JobState=RUN" { - set waited 0 - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - exit 1 - } - eof { - wait - } -} -if {$waited == 0} { - spawn $scontrol show partition - expect { - -re "Shared=FORCE" { - send_user "\nWARNING: Test incompatable with Shared=FORCE\n" - set waited 1 - exp_continue - } - eof { - wait - } - } -} -if {$waited == 0} { - send_user "\nFAILURE: srun failed to wait for non-sharing job to complete\n" - set exit_code 1 -} - -if {$exit_code == 0} { - exec $bin_rm -f $file_err $file_in $file_out - send_user "\nSUCCESS\n" -} -exit $exit_code diff --git a/testsuite/expect/test1.40 b/testsuite/expect/test1.40 deleted file mode 100755 index 89e870ae8d533c0059d9e68b050be8ae7d4f6644..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.40 +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Test of stand-alone srun resource allocation (--uid and --no-shell -# options). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." 
otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2002 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.40" -set exit_code 0 -set job_id 0 - -print_header $test_id - -# -# Submit a slurm allocate job -# -set timeout $max_job_delay -set srun_pid [spawn $srun --allocate -t1 --no-shell] -expect { - -re "SLURM_JOBID=($number).*" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} -if { $job_id == 0 } { - send_user "\nFAILURE: job_id not captured\n" - exit 1 -} - -# -# The srun should have completed, confirm the job is active -# -set found_job 0 -spawn $squeue --states=running --jobs=$job_id -expect { - -re $job_id { - set found_job 1 - exp_continue - } - timeout { - send_user "\nFAILURE: squeue not responding\n" - set exit_code 1 - } - eof { - wait - } -} -if { $found_job == 0 } { - send_user "\nFAILURE: job $job_id not in run state\n" - exit 1 -} - -# -# Kill the job -# -cancel_job $job_id - -# -# Create a job allocation as some other user, namely root -# -set job_id 0 -set srun_pid [spawn $srun --allocate -t1 --no-shell --uid=0] -expect { - -re "SLURM_JOBID=($number).*" { - set job_id $expect_out(1,string) - exp_continue - } - -re "Invalid user id" { - set job_id -1 - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} -if { $job_id == 0 } { - send_user "\nFAILURE: job_id not captured\n" - exit 1 -} - -if { $job_id == -1 } { - send_user "\nNo worries, this is expected for non-privileged users\n" -} else { -# -# The srun should have completed, confirm the job is active -# - set found_job 0 - spawn $squeue --states=running --jobs=$job_id --user=root - expect { - -re $job_id { - set found_job 1 - exp_continue - } - timeout { - send_user "\nFAILURE: squeue not responding\n" - set exit_code 1 - } - eof { - wait - } - } - if { $found_job == 0 } { - 
send_user "\nFAILURE: job $job_id not in run state\n" - exit 1 - } - cancel_job $job_id -} - -if { $exit_code == 0 } { - send_user "\nSUCCESS\n" -} -exit $exit_code diff --git a/testsuite/expect/test1.42 b/testsuite/expect/test1.42 index 38a01776b992d06b4661ce14e58e393a988939aa..be5795cef2faaf0bdef9ec94746f9a0a006aa208 100755 --- a/testsuite/expect/test1.42 +++ b/testsuite/expect/test1.42 @@ -1,8 +1,8 @@ #!/usr/bin/expect ############################################################################ # Purpose: Test of SLURM functionality -# Test of account number and job dependencies (--account, --begin -# and --depedency options). +# Test of account number and job dependencies (--account and +# --dependency options). # # Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR # "FAILURE: ..." otherwise with an explanation of the failure, OR @@ -52,9 +52,9 @@ make_bash_script $file_in "$bin_sleep 5" # Spawn a srun batch job that just sleeps for a while # set timeout $max_job_delay -set srun_pid [spawn $srun --batch --output=/dev/null --error=/dev/null --account=MY_ACCT -t1 $file_in] +set srun_pid [spawn $sbatch --output=/dev/null --error=/dev/null --account=MY_ACCT -t1 $file_in] expect { - -re "jobid ($number) submitted" { + -re "Submitted batch job ($number)" { set job_id1 $expect_out(1,string) exp_continue } @@ -142,100 +142,6 @@ if {$match_jobid != $job_id1} { set exit_code 1 } -# -# Submit a job to run at noon tomorrow -# -set job_id1 0 -set srun_pid [spawn $srun --batch --output=/dev/null --error=/dev/null --begin=noon-tomorrow $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id1 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} -if {$job_id1 == 0} { - send_user "\nFAILURE: batch submit failure\n" - exit 1 -} -exec $bin_sleep 5 -set match 0 -spawn $scontrol show job $job_id1 -expect { - -re "JobState=PENDING" { - incr
match - exp_continue - } - -re "StartTime=($number)/($number)-12:00:00" { - incr match - exp_continue - } - -re "StartTime=($number)-($number)-($number)T12:00:00" { - incr match - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - exp_continue - } - eof { - wait - } -} -if {$match != 2} { - send_user "\nFAILURE: unexpected JobState or StartTime\n" - set exit_code 1 -} -# Reset start time and test for completion -spawn $scontrol update JobId=$job_id1 StartTime=now -expect { - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - exp_continue - } - eof { - wait - } -} -set delayed 0 -set is_done 0 -while { $delayed < $max_job_delay } { - exec $bin_sleep 10 - incr delayed +10 - spawn $scontrol show job $job_id1 - expect { - -re "JobState=COMPLETED" { - set is_done 1 - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - exp_continue - } - eof { - wait - } - } - if {$is_done == 1} { - break - } -} -if {$is_done == 0} { - send_user "\nFAILURE: unexpected JobState\n" - cancel_job $job_id1 - set exit_code 1 -} - if {$exit_code == 0} { exec $bin_rm -f $file_in diff --git a/testsuite/expect/test1.45 b/testsuite/expect/test1.45 deleted file mode 100755 index d6cbe2a1b591ff293a40b1875b8f92431538b82c..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.45 +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Test the launch of a batch job within an existing job allocation. -# This logic is used by LSF -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. 
-############################################################################ -# Copyright (C) 2005-2006 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.45" -set file_in "test$test_id.input" -set file_out1 "test$test_id.output1" -set file_out2 "test$test_id.output2" -set exit_code 0 -set job_id_0 0 -set job_id_1 0 -set job_id_2 0 - -print_header $test_id - -# -# Delete left-over stdout/err files -# -exec $bin_rm -f $file_in $file_out1 $file_out2 - -# -# Build input script file -# -make_bash_script $file_in " - $bin_id - $bin_sleep 20" - -# -# Spawn a srun batch job that uses stdout/err and confirm their contents -# -if { [test_bluegene] } { - set node_cnt 1-2048 -} else { - if { [test_xcpu] } { - set node_cnt 1-1 - } else { - set node_cnt 1-4 - } -} - -set timeout $max_job_delay -set srun_pid [spawn $srun -N$node_cnt -A -v -t1] -expect { - -re "jobid ($number):" { - set job_id_0 $expect_out(1,string) - send "$srun -b --jobid=$job_id_0 -o $file_out1 $file_in \n" - exp_continue - } - -re "jobid ($number).0 submitted" { - set job_id_1 $expect_out(1,string) - send "$srun -b --jobid=$job_id_0 -o $file_out2 $bin_id \n" - exp_continue - } - -re "jobid ($number).1 submitted" { - set job_id_2 $expect_out(1,string) - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - exp_continue - } - eof { - wait - } -} - -if {$job_id_0 == 0} { - send_user "\nFAILURE: job allocation failure\n" - slow_kill $srun_pid - exit 1 -} -if {($job_id_1 == 0) || ($job_id_2 == 0)} { - send_user "\nFAILURE: batch job submit failure\n" - cancel_job $job_id_0 - exit 1 -} - -if {($job_id_0 != $job_id_2) || ($job_id_1 != $job_id_2)} { - send_user "\nFAILURE: batch job did not run in existing allocation\n" - cancel_job $job_id_0 - cancel_job $job_id_1 - cancel_job $job_id_2 - exit 1 -} - -# -# Check that the job step is reported -# -set matches 0 -spawn $scontrol show step $job_id_0.0 -expect { - -re "Invalid" { - send_user "\nFAILURE: batch step not found\n" - set matches 1 - set exit_code 1 - 
exp_continue - } - -re "$job_id_0.0" { - set matches 1 - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - set exit_code 1 - } - eof { - wait - } -} -if {$matches == 0} { - send_user "\nFAILURE: batch step not found\n" - set exit_code 1 -} - -# -# Check batch job step output -# -if {[wait_for_file $file_out1] == 0} { - set matches 0 - spawn $bin_cat $file_out1 - expect { - -re "uid=" { - set matches 1 - exp_continue - } - eof { - wait - } - } - if {$matches == 0} { - send_user "\nFAILURE: Job output missing\n" - set exit_code 1 - } -} -if {[wait_for_file $file_out2] == 0} { - set matches 0 - spawn $bin_cat $file_out2 - expect { - -re "uid=" { - set matches 1 - exp_continue - } - -re "srun.*command not found" { - send_user "\nWARNING: srun is not installed on this computer\n" - set matches 1 - exp_continue - } - eof { - wait - } - } - if {$matches == 0} { - send_user "\nFAILURE: Job output missing\n" - set exit_code 1 - } -} - -# -# Make sure job is still active, then cancel it -# -set matches 0 -spawn $scontrol -o show job $job_id_0 -expect { - -re "JobState=RUNNING" { - set matches 1 - exp_continue - } - eof { - wait - } -} -if {$matches == 0} { - send_user "\nFAILURE: Job not still running\n" - set exit_code 1 -} -cancel_job $job_id_0 - -if {$exit_code == 0} { - send_user "\nSUCCESS\n" - exec $bin_rm -f $file_in $file_out1 $file_out2 -} -exit $exit_code diff --git a/testsuite/expect/test1.47 b/testsuite/expect/test1.47 deleted file mode 100755 index 7c6b8bfe18076b13f1587978e96477ed913434c9..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.47 +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Tests #SLURM entry functionality in a batch script. -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." 
otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2005-2006 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Danny Auble <da@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
-############################################################################ -source ./globals - -set test_id "1.47" -set exit_code 0 -set file_in "test$test_id.input" -set file_out "test$test_id.output" -set job_acct "TEST_ACCT" -set job_name "TEST_NAME" -set delay 1 - -print_header $test_id - -make_bash_script $file_in " - #SLURM --job-name=$job_name - #SLURM --account=$job_acct - $bin_sleep $delay -" - -set timeout $max_job_delay -set job_id 0 -set srun_pid [spawn $srun -o $file_out -b $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - exp_continue - } - eof { - wait - } -} -if {$job_id == 0} { - send_user "\nFAILURE: batch submit failure\n" - exit 1 -} -set matches 0 -spawn $scontrol show job $job_id -expect { - -re "Name=$job_name" { - incr matches - exp_continue - } - -re "Account=$job_acct" { - incr matches - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - exp_continue - } - eof { - wait - } -} -if {$matches != 2} { - send_user "\nFAILURE: did not set job name and account from batch script\n" - set exit_code 1 -} - -# -# Build input script file -# NOTE: The initial sleep is so that all of the submissions have time -# to occur before contending with a multitude of job step creations. -# This is especially important on very slow systems (e.g. AIX). 
-# -make_bash_script $file_in " - #SLURM -N1000000k - $bin_sleep $delay -" - -set matches 0 -set srun_pid [spawn $srun -o $file_out -b $file_in] -expect { - -re "More .* requested than permitted" { - send_user "This error was expected, no worries\n\n" - incr matches - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} -if {$matches != 1} { - send_user "\nFAILURE: srun didn't read the correct options from batch file\n" - set exit_code 1 -} - -make_bash_script $file_in " - #SLURM -N650000 - $bin_sleep $delay -" - -set srun_pid [spawn $srun -N1 -o $file_out -b $file_in] -expect { - -re "More nodes requested than permitted" { - send_user "\nFAILURE: srun read from the batch file options" - send_user "over writing the commandline options\n" - set exit_code 1 - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} - -# -# Post-processing -# -if {$exit_code == 0} { - exec $bin_rm -f $file_in $file_out - send_user "\nSUCCESS\n" -} -exit $exit_code diff --git a/testsuite/expect/test1.53 b/testsuite/expect/test1.53 deleted file mode 100755 index 8f033ed3f7f50910e04c2aab2e9474fac99c4473..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.53 +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Test of nice value specification (--nice option). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "FAILURE: ..." otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2005 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
-# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. -# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -############################################################################ -source ./globals - -set test_id "1.53" -set exit_code 0 -set file_in "test$test_id.input" -set job_id1 0 -set job_id2 0 -set job_id3 0 -set job_prio1 0 -set job_prio2 0 -set job_prio3 0 - -print_header $test_id - -# -# Build input script file -# -make_bash_script $file_in "$bin_sleep 60" - -# -# Submit three jobs with differing nice values -# -set srun_pid [spawn $srun --batch --output=/dev/null --error=/dev/null -t2 $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id1 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - exit 1 - } - eof { - wait - } -} -if {$job_id1 == 0} { - send_user "\nFAILURE: srun submit failed\n" - exit 1 -} - -set srun_pid [spawn $srun --batch --output=/dev/null --error=/dev/null -t2 --nice $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id2 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - cancel_job $job_id1 - slow_kill $srun_pid - exit 1 
- } - eof { - wait - } -} -if {$job_id2 == 0} { - send_user "\nFAILURE: srun submit failed\n" - cancel_job $job_id1 - exit 1 -} -set srun_pid [spawn $srun --batch --output=/dev/null --error=/dev/null -t2 --nice=200 $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id3 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - cancel_job $job_id1 - cancel_job $job_id2 - slow_kill $srun_pid - exit 1 - } - eof { - wait - } -} - -exec $bin_rm -f $file_in - -# -# Get the priority of each job job with scontrol -# -spawn $scontrol show job $job_id1 -expect { - -re "Priority=($number)" { - set job_prio1 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } -} -spawn $scontrol show job $job_id2 -expect { - -re "Priority=($number)" { - set job_prio2 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } -} -spawn $scontrol show job $job_id3 -expect { - -re "Priority=($number)" { - set job_prio3 $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } -} - -# -# Make sure the job priorities are as expected -# -if {$job_prio1 == 0 || $job_prio2 == 0 || $job_prio3 == 0} { - send_user "\nFAILURE: failed to job priorities of each submitted job\n" - set exit_code 1 -} else { - set diff2 [expr $job_prio1 - $job_prio2] - set diff3 [expr $job_prio1 - $job_prio3] -# Target for diff2 is 101 - if {$diff2 < 91 || $diff2 > 111} { - send_user "\nFAILURE: job2 priority delta bad $diff2\n" - set exit_code 1 - } -# Target for diff3 is 202 - if {$diff3 < 192 || $diff3 > 212} { - send_user "\nFAILURE: job3 priority delta bad $diff3\n" - set exit_code 1 - } -} - -cancel_job $job_id1 -cancel_job $job_id2 -cancel_job $job_id3 -if {$exit_code == 0} { - send_user "\nSUCCESS\n" 
-} -exit $exit_code - diff --git a/testsuite/expect/test1.58 b/testsuite/expect/test1.58 index 917b978205a70d2202c587495378a2bf55ee6c35..e5bfe5703c17c1bb4abc12a5e46beaf95a868555 100755 --- a/testsuite/expect/test1.58 +++ b/testsuite/expect/test1.58 @@ -42,10 +42,10 @@ set timeout $max_job_delay # Run an srun to grab a single node allocation, but not start any # job steps. # -set srun_alloc_pid [spawn $srun -v -N1 -n1 -A $bin_sleep 600] +set srun_alloc_pid [spawn $salloc -v -N1 -n1 $bin_sleep 600] set srun_alloc_sid $spawn_id expect { - -re "srun: jobid ($number)" { + -re "salloc: Granted job allocation ($number)" { set jobid $expect_out(1,string) } timeout { @@ -90,7 +90,7 @@ if {$got_pattern == 0} { } # -# Release the allocation by killing the first srun (really it kills the "sleep") +# Release the allocation by killing salloc (really it kills the "sleep") # cancel_job $jobid set spawn_id $srun_alloc_sid diff --git a/testsuite/expect/test1.59 b/testsuite/expect/test1.59 index 9013f8264733acc40ca107d580bbe856f3e1cae6..0c87934c448c78655529e233220667ad0d1e1820 100755 --- a/testsuite/expect/test1.59 +++ b/testsuite/expect/test1.59 @@ -84,9 +84,9 @@ set node3 0 set node4 0 set timeout $max_job_delay -spawn $srun -N$num_nodes -A -v bash +spawn $salloc -N$num_nodes -v bash expect { - -re "jobid ($number):" { + -re "salloc: Granted job allocation ($number)" { set job_id $expect_out(1,string) exp_continue } diff --git a/testsuite/expect/test1.7 b/testsuite/expect/test1.7 index 21fa91d47de540b134a5d725d2a93699d9cd4b8d..6f09463c6709e638c005072a72562adad6e9ad1b 100755 --- a/testsuite/expect/test1.7 +++ b/testsuite/expect/test1.7 @@ -171,59 +171,6 @@ if {$completions != 1} { set exit_code 1 } -# -# Spawn a srun batch job with arguments -# -set timeout $max_job_delay -set srun_pid [spawn $srun --batch --output=$file_out --error=$file_err -t4 ./$file_in] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user
"\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} - -if {$job_id == 0} { - send_user "\nFAILURE: batch submit failure\n" - exit 1 -} - -# -# Wait for job to complete and check output -# -set output_fini 0 -if {[wait_for_job $job_id "DONE"] != 0} { - send_user "\nFAILURE: waiting for job to complete\n" - set exit_code 1 -} - -if {[wait_for_file $file_out] == 0} { - spawn $bin_cat $file_out - expect { - -re "FINI" { - set output_fini 1 - exp_continue - } - eof { - wait - } - } -} - -if {$output_fini == 0} { - send_user "\nFAILURE: Unexpected batch job output, " - send_user "possible premature job termination\n" - set exit_code 1 -} - if {$exit_code == 0} { exec $bin_rm -f $file_in $file_out $file_err send_user "\nSUCCESS\n" diff --git a/testsuite/expect/test1.85 b/testsuite/expect/test1.85 deleted file mode 100755 index dbf0fc2fa4e5a7abc713e06505e1b4d2464af615..0000000000000000000000000000000000000000 --- a/testsuite/expect/test1.85 +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/expect -############################################################################ -# Purpose: Test of SLURM functionality -# Test of partition specification on job submission (--partition -# option). -# -# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR -# "WARNING: ..." with an explanation of why the test can't be made, OR -# "FAILURE: ..." otherwise with an explanation of the failure, OR -# anything else indicates a failure mode that must be investigated. -############################################################################ -# Copyright (C) 2002 The Regents of the University of California. -# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -# Written by Morris Jette <jette1@llnl.gov> -# UCRL-CODE-226842. -# -# This file is part of SLURM, a resource management program. -# For details, see <http://www.llnl.gov/linux/slurm/>. 
-# -# SLURM is free software; you can redistribute it and/or modify it under -# the terms of the GNU General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) -# any later version. -# -# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License along -# with SLURM; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -############################################################################ -source ./globals - -set test_id "1.85" -set def_part_name "" -set exit_code 0 -set file_in "test$test_id.input" -set job_id 0 -set other_part_name "" - -print_header $test_id - -# -# Identify the partitions in the cluster, identifying the default -# -spawn $sinfo --summarize -expect { - -re "($end_of_line)($alpha_numeric)(\[ \*\]) *up" { - if (![string compare $expect_out(3,string) "*"]) { - set def_part_name $expect_out(2,string) - } else { - set other_part_name $expect_out(2,string) - } - exp_continue - } - -re "Unable to contact" { - send_user "\nFAILURE: slurm appears to be down\n" - exit 1 - } - timeout { - send_user "\nFAILURE: sinfo not responding\n" - set exit_code 1 - } - eof { - wait - } -} - -# -# Build input script file -# -make_bash_script $file_in "$srun $bin_sleep $max_job_delay" - -# -# Submit a batch job explicitly to the default partition -# -set job_id 0 -set srun_pid [spawn $srun --batch --output=/dev/null --error=/dev/null --hold --partition=$def_part_name -t1 $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - 
wait - } -} -# Confirm the job's partition -if {$job_id == 0} { - send_user "\nFAILURE: batch submit failure\n" - set exit_code 1 -} else { - set read_part "" - spawn $scontrol show job $job_id - expect { - -re "Partition=($alpha_numeric)" { - set read_part $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } - } - if ([string compare $read_part $def_part_name]) { - send_user "\nFAILURE: Improper partition selected\n" - set exit_code 1 - } - cancel_job $job_id -} - -# -# Test if a non-default partition exists, terminate if none -# -if (![string compare $other_part_name ""]) { - send_user "\nWARNING: can't test srun partition option" - send_user " only the default partition exists\n" - exec $bin_rm -f $file_in - exit $exit_code -} - -# -# Submit job explicitly to a non-default partition -# -set job_id 0 -set srun_pid [spawn $srun --batch --output=/dev/null --error=/dev/null --hold --partition=$other_part_name -t1 $file_in] -expect { - -re "jobid ($number) submitted" { - set job_id $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: srun not responding\n" - slow_kill $srun_pid - set exit_code 1 - } - eof { - wait - } -} -exec $bin_rm -f $file_in -# Confirm the job's partition -if {$job_id == 0} { - send_user "\nFAILURE: batch submit failure\n" - set exit_code 1 -} else { - set read_part "" - spawn $scontrol show job $job_id - expect { - -re "Partition=($alpha_numeric)" { - set read_part $expect_out(1,string) - exp_continue - } - timeout { - send_user "\nFAILURE: scontrol not responding\n" - set exit_code 1 - } - eof { - wait - } - } - if ([string compare $read_part $other_part_name]) { - send_user "\nFAILURE: Improper partition selected\n" - set exit_code 1 - } - cancel_job $job_id -} - -if {$exit_code == 0} { - send_user "\nSUCCESS\n" -} -exit $exit_code diff --git a/testsuite/expect/test1.87 b/testsuite/expect/test1.87 index 
be5f10717f5c11e181f79259268e2eb827e66143..a6e3c0db4ff9155c442ae3ae4d98372ee50468b8 100755 --- a/testsuite/expect/test1.87 +++ b/testsuite/expect/test1.87 @@ -57,7 +57,7 @@ make_bash_script $file_in " # Submit a 4 node job # set timeout $max_job_delay -set srun_pid [spawn $srun -N4 -A $file_in] +set srun_pid [spawn $salloc -N4 ./$file_in] expect { -re "More ($alpha) requested than permitted" { send_user "\nWARNING: can't test srun task distribution\n" diff --git a/testsuite/expect/test1.92 b/testsuite/expect/test1.92 index c742424d3ca55a995d3ed5e461c42c2be09e1b36..e47c21e802c39f0f179d843c03ad9c807fec3b95 100644 --- a/testsuite/expect/test1.92 +++ b/testsuite/expect/test1.92 @@ -51,7 +51,7 @@ exit 0 # # Create an allocation # -set srun_pid [spawn $srun --allocate -N2 --verbose -t2] +set srun_pid [spawn $salloc -N2 --verbose -t2 bash] expect { -re "More ($alpha) requested than permitted" { send_user "\nWARNING: can't test srun task distribution\n"