From 6dffdb9a60ecf366a9454ff46195c52926055b8f Mon Sep 17 00:00:00 2001 From: "Christopher J. Morrone" <morrone2@llnl.gov> Date: Thu, 20 Jul 2006 03:17:02 +0000 Subject: [PATCH] Changes to salloc: - returns the return code of its child command - add timeout for limiting how long it waits for a PENDING job allocation (slurm_allocate_resources_blocking was updated to timeout correctly) - cleans up job allocation on Ctrl-C (Actually, it ignores SIGINT, SIGTERM, SIGQUIT, but unblocks them for the child, so ctrl-c from the shell hits the whole process group, and the child exits, and salloc cleans up like normal. Might to be more proactive with signals, but I am not yet certain.) - trim more unnecessary command line options - man page additions --- doc/man/man1/salloc.1 | 13 +++++++---- slurm/slurm.h.in | 3 ++- src/api/allocate.c | 36 ++++++++++++++++++----------- src/salloc/opt.c | 53 ++++--------------------------------------- src/salloc/opt.h | 3 --- src/salloc/salloc.c | 43 ++++++++++++++++++++++++----------- 6 files changed, 68 insertions(+), 83 deletions(-) diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index c048d2fd8da..be61083b0da 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -14,15 +14,20 @@ The command may be any program the user wishes. Some typical commands are xterm .SH "OPTIONS" .LP .TP -\fB\-\-jobid\fR <\fIJOBID\fP> +\fB\-\-jobid\fR[=]<\fIJOBID\fP> The job allocation under which the parallel application should be launched. .TP -\fB\-n\fR, \fB\-\-ntasks\fR=<\fInumber\fR> +\fB\-n\fR, \fB\-\-ntasks\fR[=]<\fInumber\fR> Specify the number of processes to launch. The default is one process per node, but note that the \-c parameter will change this default. .TP -\fB\-N\fR, \fB\-\-nodes\fR=<\fInumber|[min]\-[max]\fR> +\fB\-N\fR, \fB\-\-nodes\fR[=]<\fInumber|[min]\-[max]\fR> Specify the number of nodes to be used by this job step. This option accepts either a single number, or a range of possible node counts. If a single number is used, such as "\-N 4", then the allocation is asking for four and ONLY four nodes. If a range is specified, such as "\-N 2\-6", SLURM controller may grant salloc anywhere from 2 to 6 nodes. When using a range, either of the min or max options may be omitted. For instance, "\-N 10\-" means "no fewer than 10 nodes", and "\-N \-20" means "no more than 20 nodes". The default value of this option is one node, but other options may require more than one node to be allocated. - +.TP +\fB\-I\fR,\fB\-\-immediate\fR +Do not wait for the resources need to grant this allocation. Normally salloc will wait for the resources necessary to satisfy the requested job allocation, but when \-\-immediate is specified it will exit immediately without running the \fIcommand\fR parameter. +.TP +\fB\-W\fR, \fB\-\-wait\fR[=]<\fIseconds\fR> +If the resources needed to satisy a job allocation are not immediately available, the job allocation is enqueued and is said to be PENDING. This option tells salloc how long (in seconds) to wait for the allocation to be granted before giving up. When the wait limit has been reached, salloc will exit without running the \fIcommand\fR parameter. By default, salloc will wait indefinitely. (The \-\-immediate option makes \-\-wait moot.) .TP \fB\-c\fR, \fB\-\-cpus\-per\-task\fR=<\fIncpus\fR> diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 8249b089d49..87f79562c89 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -761,7 +761,8 @@ extern int slurm_allocate_resources PARAMS(( * A timeout of zero will wait indefinitely. * * RET allocation structure on success, NULL on error set errno to - * indicate the error + * indicate the error (errno will be ETIMEDOUT if the timeout is reached + * with no allocation granted) * NOTE: free the allocation structure using * slurm_free_resource_allocation_response_msg */ diff --git a/src/api/allocate.c b/src/api/allocate.c index b59051a09d7..d602c52de8d 100644 --- a/src/api/allocate.c +++ b/src/api/allocate.c @@ -146,12 +146,14 @@ slurm_allocate_resources (job_desc_msg_t *req, * A timeout of zero will wait indefinitely. * * RET allocation structure on success, NULL on error set errno to - * indicate the error + * indicate the error (errno will be ETIMEDOUT if the timeout is reached + * with no allocation granted) * NOTE: free the allocation structure using * slurm_free_resource_allocation_response_msg */ resource_allocation_response_msg_t * -slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, time_t timeout) +slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, + time_t timeout) { int rc; slurm_msg_t req_msg; @@ -236,14 +238,14 @@ slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, time_t timeou /* no, we need to wait for a response */ job_id = resp->job_id; slurm_free_resource_allocation_response_msg(resp); - verbose("Pending job allocation %u", job_id); + info("Pending job allocation %u", job_id); resp = _wait_for_allocation_response(job_id, listen, timeout); /* If NULL, we didn't get the allocation in the time desired, so just free the job id */ if (resp == NULL) { + errnum = errno; slurm_complete_job(job_id, -1); - errnum = -1; } } break; @@ -687,11 +689,12 @@ _wait_for_alloc_rpc(const listen_t *listen, int sleep_time, resource_allocation_response_msg_t **resp) { struct pollfd fds[1]; + int rc; fds[0].fd = listen->fd; fds[0].events = POLLIN; - while (poll (fds, 1, (sleep_time * 1000)) < 0) { + while ((rc = poll(fds, 1, (sleep_time * 1000))) < 0) { switch (errno) { case EAGAIN: case EINTR: @@ -708,20 +711,25 @@ _wait_for_alloc_rpc(const listen_t *listen, int sleep_time, } } - if (fds[0].revents & POLLIN) + if (rc == 0) { /* poll timed out */ + errno = ETIMEDOUT; + } else if (fds[0].revents & POLLIN) { return (_accept_msg_connection(listen->fd, resp)); + } - return (0); + return 0; } static resource_allocation_response_msg_t * _wait_for_allocation_response(uint32_t job_id, const listen_t *listen, int timeout) { - resource_allocation_response_msg_t *resp; + resource_allocation_response_msg_t *resp = NULL; + int errnum; - debug ("job %u queued and waiting for resources", job_id); + debug("job %u queued and waiting for resources", job_id); if (_wait_for_alloc_rpc(listen, timeout, &resp) <= 0) { + errnum = errno; /* Maybe the resource allocation response RPC got lost * in the mail; surely it should have arrived by now. * Let's see if the controller thinks that the allocation @@ -730,10 +738,12 @@ _wait_for_allocation_response(uint32_t job_id, const listen_t *listen, if (slurm_allocation_lookup(job_id, &resp) >= 0) return resp; - if (slurm_get_errno() == ESLURM_JOB_PENDING) - debug3 ("Still waiting for allocation"); - else { - debug3 ("Unable to confirm allocation for job %u: %m", + if (slurm_get_errno() == ESLURM_JOB_PENDING) { + debug3("Still waiting for allocation"); + errno = errnum; + return NULL; + } else { + debug3("Unable to confirm allocation for job %u: %m", job_id); return NULL; } diff --git a/src/salloc/opt.c b/src/salloc/opt.c index acfda5da907..a5c4eff1155 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -84,7 +84,6 @@ #define OPT_CONN_TYPE 0x08 #define OPT_NO_ROTATE 0x0a #define OPT_GEOMETRY 0x0b -#define OPT_MPI 0x0c #define OPT_CPU_BIND 0x0d #define OPT_MEM_BIND 0x0e @@ -100,7 +99,6 @@ #define LONG_OPT_CONT 0x109 #define LONG_OPT_UID 0x10a #define LONG_OPT_GID 0x10b -#define LONG_OPT_MPI 0x10c #define LONG_OPT_CORE 0x10e #define LONG_OPT_NOSHELL 0x10f #define LONG_OPT_DEBUG_TS 0x110 @@ -742,10 +740,9 @@ static void _opt_default() opt.no_requeue = false; opt.noshell = false; - opt.max_wait = slurm_get_wait_time(); + opt.max_wait = 0; opt.quit_on_intr = false; - opt.disable_status = false; opt.test_only = false; opt.quiet = 0; @@ -815,8 +812,6 @@ env_vars_t env_vars[] = { {"SLURM_PARTITION", OPT_STRING, &opt.partition, NULL }, {"SLURM_TIMELIMIT", OPT_INT, &opt.time_limit, NULL }, {"SLURM_WAIT", OPT_INT, &opt.max_wait, NULL }, - {"SLURM_DISABLE_STATUS",OPT_INT, &opt.disable_status,NULL }, - {"SLURM_MPI_TYPE", OPT_MPI, NULL, NULL }, {NULL, 0, NULL, NULL} }; @@ -919,15 +914,6 @@ _process_env_var(env_vars_t *e, const char *val) e->var, val); } break; - - case OPT_MPI: - if (srun_mpi_init((char *)val) == SLURM_ERROR) { - fatal("\"%s=%s\" -- invalid MPI type, " - "--mpi=list for acceptable types.", - e->var, val); - } - break; - default: /* do nothing */ break; @@ -962,7 +948,6 @@ void set_options(const int argc, char **argv, int first) { int opt_char, option_index = 0; static bool set_name=false; - struct utsname name; static struct option long_options[] = { {"cpus-per-task", required_argument, 0, 'c'}, {"constraint", required_argument, 0, 'C'}, @@ -988,15 +973,12 @@ void set_options(const int argc, char **argv, int first) {"nodelist", required_argument, 0, 'w'}, {"wait", required_argument, 0, 'W'}, {"exclude", required_argument, 0, 'x'}, - {"disable-status", no_argument, 0, 'X'}, - {"no-allocate", no_argument, 0, 'Z'}, {"contiguous", no_argument, 0, LONG_OPT_CONT}, {"exclusive", no_argument, 0, LONG_OPT_EXCLUSIVE}, {"cpu_bind", required_argument, 0, LONG_OPT_CPU_BIND}, {"mem_bind", required_argument, 0, LONG_OPT_MEM_BIND}, {"mincpus", required_argument, 0, LONG_OPT_MINCPU}, {"mem", required_argument, 0, LONG_OPT_MEM}, - {"mpi", required_argument, 0, LONG_OPT_MPI}, {"no-shell", no_argument, 0, LONG_OPT_NOSHELL}, {"tmp", required_argument, 0, LONG_OPT_TMP}, {"msg-timeout", required_argument, 0, LONG_OPT_TIMEO}, @@ -1021,7 +1003,7 @@ void set_options(const int argc, char **argv, int first) {NULL, 0, 0, 0} }; char *opt_string = "+a:c:C:g:HIJ:km:N:" - "Op:P:qQR:st:U:vVw:W:x:XZ"; + "Op:P:qQR:st:U:vVw:W:x:"; if(opt.progname == NULL) opt.progname = xbasename(argv[0]); @@ -1185,15 +1167,6 @@ void set_options(const int argc, char **argv, int first) if (!_valid_node_list(&opt.exc_nodes)) exit(1); break; - case (int)'X': - opt.disable_status = true; - break; - case (int)'Z': - opt.no_alloc = true; - uname(&name); - if (strcasecmp(name.sysname, "AIX") == 0) - opt.network = xstrdup("ip"); - break; case LONG_OPT_CONT: opt.contiguous = true; break; @@ -1221,13 +1194,6 @@ void set_options(const int argc, char **argv, int first) exit(1); } break; - case LONG_OPT_MPI: - if (srun_mpi_init((char *)optarg) == SLURM_ERROR) { - fatal("\"--mpi=%s\" -- long invalid MPI type, " - "--mpi=list for acceptable types.", - optarg); - } - break; case LONG_OPT_NOSHELL: opt.noshell = true; break; @@ -1381,16 +1347,6 @@ static bool _opt_verify(void) verified = false; } - if (opt.no_alloc && !opt.nodelist) { - error("must specify a node list with -Z, --no-allocate."); - verified = false; - } - - if (opt.no_alloc && opt.exc_nodes) { - error("can not specify --exclude list with -Z, --no-allocate."); - verified = false; - } - if (opt.mincpus < opt.cpus_per_task) opt.mincpus = opt.cpus_per_task; @@ -1685,10 +1641,9 @@ static void _help(void) " (type = block|cyclic|hostfile)\n" " -J, --job-name=jobname name of job\n" " --mpi=type type of MPI being used\n" -" -W, --wait=sec seconds to wait after first task exits\n" -" before killing job\n" +" -W, --wait=sec seconds to wait for allocation if not\n" +" immediately available\n" " -q, --quit-on-interrupt quit on single Ctrl-C\n" -" -X, --disable-status Disable Ctrl-C status feature\n" " -v, --verbose verbose mode (multiple -v's increase verbosity)\n" " -Q, --quiet quiet mode (suppress informational messages)\n" " -d, --slurmd-debug=level slurmd debug level\n" diff --git a/src/salloc/opt.h b/src/salloc/opt.h index c81e3ea98d4..77659199308 100644 --- a/src/salloc/opt.h +++ b/src/salloc/opt.h @@ -81,7 +81,6 @@ typedef struct salloc_options { enum task_dist_states distribution; /* --distribution=, -m dist */ char *job_name; /* --job-name=, -J name */ - char *mpi_type; /* --mpi=type */ unsigned int dependency;/* --dependency, -P jobid */ int nice; /* --nice */ char *account; /* --account, -U acct_name */ @@ -96,7 +95,6 @@ typedef struct salloc_options { bool share; /* --share, -s */ int max_wait; /* --wait, -W */ bool quit_on_intr; /* --quit-on-interrupt, -q */ - bool disable_status; /* --disable-status, -X */ int quiet; bool test_only; /* --test-only */ char *propagate; /* --propagate[=RLIMIT_CORE,...]*/ @@ -109,7 +107,6 @@ typedef struct salloc_options { bool contiguous; /* --contiguous */ char *nodelist; /* --nodelist=node1,node2,... */ char *exc_nodes; /* --exclude=node1,node2,... -x */ - bool no_alloc; /* --no-allocate, -Z */ int max_launch_time; /* Undocumented */ int max_exit_timeout; /* Undocumented */ int msg_timeout; /* Undocumented */ diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 749cb70d51b..bd60e94a927 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -42,6 +42,7 @@ #include "src/common/xstring.h" #include "src/common/xmalloc.h" +#include "src/common/xsignal.h" #include "src/common/read_config.h" #include "src/salloc/opt.h" @@ -49,7 +50,9 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc); static void ring_terminal_bell(void); -static void run_command(void); +static int run_command(char **command); + +static int signals_to_block[] = {SIGINT, SIGTERM, SIGQUIT, 0}; int main(int argc, char *argv[]) { @@ -58,6 +61,7 @@ int main(int argc, char *argv[]) resource_allocation_response_msg_t *alloc; time_t before, after; salloc_msg_thread_t *msg_thr; + int rc; log_init(xbasename(argv[0]), logopt, 0, NULL); if (initialize_and_process_args(argc, argv) < 0) { @@ -71,6 +75,8 @@ int main(int argc, char *argv[]) log_alter(logopt, 0, NULL); } + xsignal_block(signals_to_block); + /* * Request a job allocation */ @@ -86,7 +92,7 @@ int main(int argc, char *argv[]) verbose("other_hostname = %s", desc.other_hostname); before = time(NULL); - alloc = slurm_allocate_resources_blocking(&desc, 0); + alloc = slurm_allocate_resources_blocking(&desc, opt.max_wait); if (alloc == NULL) fatal("Failed to allocate resources: %m"); after = time(NULL); @@ -106,7 +112,7 @@ int main(int argc, char *argv[]) */ setenvfs("SLURM_JOBID=%d", alloc->job_id); setenvfs("SLURM_NNODES=%d", alloc->node_cnt); - run_command(); + rc = run_command(command_argv); /* * Relinquish the job allocation. @@ -119,7 +125,7 @@ int main(int argc, char *argv[]) slurm_free_resource_allocation_response_msg(alloc); msg_thr_destroy(msg_thr); - return 0; + return rc; } @@ -128,7 +134,7 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) { desc->contiguous = opt.contiguous ? 1 : 0; desc->features = opt.constraints; - desc->immediate = opt.immediate; + desc->immediate = opt.immediate ? 1 : 0; desc->name = opt.job_name; desc->req_nodes = opt.nodelist; if (desc->req_nodes == NULL) { @@ -214,25 +220,36 @@ static void ring_terminal_bell(void) fflush(stdout); } -static void run_command(void) +/* returns the exit status of the command */ +static int run_command(char **command) { pid_t pid; int status; - int rc; + int rc = 0; pid = fork(); if (pid < 0) { error("fork failed: %m"); } else if (pid > 0) { /* parent */ - while ((rc = waitpid(pid, &status, 0)) == -1) { - if (errno != EINTR) { - error("waitpid failed: %m"); - break; - } + while ((rc = waitpid(pid, &status, 0)) == -1 + && errno == EINTR) { + /* just keep spinning */ + } + if (rc == -1) { + error("waitpid for %s failed: %m", command[0]); + rc = 1; + } else { + if (WIFEXITED(status)) + rc = WEXITSTATUS(status); + else + rc = 1; } } else { /* child */ - execvp(command_argv[0], command_argv); + xsignal_unblock(signals_to_block); + execvp(command[0], command); } + + return rc; } -- GitLab